In [None]:
import pandas as pd
import numpy as np
import plotly.express as px
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
import matplotlib.pyplot as plt

In [None]:
# Load GlobalTemperatures.csv into a DataFrame and preview the first rows.
# The path is relative to the notebook's working directory.
global_temp=pd.read_csv('../input/global-temperature/GlobalTemperatures.csv')
global_temp.head()

In [None]:
def year(date):
    """Return the text before the first '-' in ``date``.

    For a 'YYYY-MM-DD' string this is the year, still as a string.
    """
    leading_field, _, _ = date.partition('-')
    return leading_field

In [None]:
# Derive a 'years' column by applying year() to every 'dt' value
# (the result is still a string at this point).
global_temp['years']=global_temp['dt'].apply(year)
global_temp.head()

In [None]:
# Inspect column dtypes and non-null counts.
global_temp.info()

In [None]:
# Cast 'years' to integers so it can be grouped and plotted numerically.
global_temp['years']=global_temp['years'].astype(str).astype(int)
global_temp.info()

In [None]:
# Per-year mean of the land temperature and of its uncertainty, with
# 'years' kept as a regular column rather than the index.
data=global_temp.groupby('years',as_index=False)[
    ['LandAverageTemperature','LandAverageTemperatureUncertainty']
].mean()
data.head()

In [None]:
# Upper and lower edges of the band: yearly mean plus/minus the yearly mean
# uncertainty.
data=data.assign(
    Uncertainity_top=data['LandAverageTemperature']+data['LandAverageTemperatureUncertainty'],
    Uncertainity_bottom=data['LandAverageTemperature']-data['LandAverageTemperatureUncertainty'],
)
data.head()

In [None]:
# Line chart of the yearly mean temperature together with the upper and lower
# uncertainty bounds computed above.
fig=px.line(data,x='years',y=['LandAverageTemperature','Uncertainity_top', 'Uncertainity_bottom']
            ,title='Avg Land Temp in World')
fig.show()

- From the charts we can see that global warming is happening now.
- The average temperature of the Earth's surface is at its highest value in the last three centuries.
- The fastest temperature growth occurred in the last 30 years!
- This worries us; we hope humanity will soon switch fully to ecological sources of energy, which will reduce CO2.
- If that does not happen, we will be facing a disaster.
- These charts also show confidence intervals, which indicate that temperature measurement has become more accurate in the last few years.

In [None]:
# Parse the 'dt' strings into datetime64 values so the .dt accessor can be used.
global_temp['dt']=pd.to_datetime(global_temp['dt'])
global_temp.info()

In [None]:
# Extract the calendar month number (1-12) of each row.
global_temp['month']=global_temp['dt'].dt.month
global_temp.head()

In [None]:
def get_season(month):
    """Map a month number to a season name.

    Months 3-5 are 'spring', 6-8 'summer', 9-11 'autumn'; every other value
    (including 12, 1 and 2) is 'winter'.
    """
    season_bounds = (
        (3, 5, 'spring'),
        (6, 8, 'summer'),
        (9, 11, 'autumn'),
    )
    for first_month, last_month, label in season_bounds:
        if first_month <= month <= last_month:
            return label
    return 'winter'

In [None]:
# Label every row with its season, derived element-wise from the month number.
global_temp['season']=global_temp['month'].map(get_season)
global_temp.head()

In [None]:
# Distinct year values, in order of first appearance; preview the first five.
years=global_temp['years'].unique()
years[:5]

In [None]:
# Empty accumulators for the per-year mean temperature of each season.
spring_temps,summer_temps,autumn_temps,winter_temps=[],[],[],[]

In [None]:
# Mean land temperature for every (year, season) pair in one grouped pass.
#
# Why not a `for year in years:` loop that appends to the lists?
#   * a loop variable called `year` overwrites the year() helper defined
#     earlier, so re-running the cell that applies year() would fail;
#   * appending to lists created in another cell duplicates the entries every
#     time this cell is re-run.
# Here the four lists are rebuilt from scratch, so the cell is idempotent.
seasonal_means=(
    global_temp.groupby(['years','season'])['LandAverageTemperature']
    .mean()
    .unstack('season')
    # Same row order as `years`; a season with no rows becomes a NaN column.
    .reindex(index=years,columns=['spring','summer','autumn','winter'])
)
spring_temps=seasonal_means['spring'].tolist()
summer_temps=seasonal_means['summer'].tolist()
autumn_temps=seasonal_means['autumn'].tolist()
winter_temps=seasonal_means['winter'].tolist()

In [None]:
# Start from an empty DataFrame; its columns are filled in the next cell.
season=pd.DataFrame()
season.head()

In [None]:
# One row per year, one column per season's mean temperature.
season=pd.DataFrame({
    'year':years,
    'spring_temps':spring_temps,
    'summer_temps':summer_temps,
    'autumn_temps':autumn_temps,
    'winter_temps':winter_temps,
})
season.head()

In [None]:
# One line per season showing its yearly mean temperature.
fig=px.line(season,x='year',y=['spring_temps', 'summer_temps', 'autumn_temps', 'winter_temps'],title='Avg Temp in Each Season')
fig.show()

Is it getting warmer? Yes, it is.

In [None]:
# Load the per-city temperature file and preview the first rows.
cities=pd.read_csv('../input/global-temperature/GlobalLandTemperaturesByCity.csv')
cities.head()

In [None]:
# (rows, columns) of the city-level table.
cities.shape

In [None]:
# Number of rows per country, most frequent first.
cities.Country.value_counts()

In [None]:
# Keep only the rows whose Country is India.
ind=cities.loc[cities['Country'].eq('India')]
ind.head()

In [None]:
# Distinct city names present in the India subset.
ind.City.unique()

In [None]:
# Select the rows for the cities listed in `data` (here only Jamshedpur).
# NOTE(review): `data` previously held the yearly-aggregate DataFrame; rebinding
# it to a list means the earlier cells that use `data` as a frame cannot be
# re-run after this one without re-creating it.
data=['Jamshedpur']
jsr=ind[ind['City'].isin(data)]
jsr.head()

In [None]:
# Keep only the date and temperature columns.
# The explicit .copy() makes `jsr` an independent frame: later cells assign
# new columns to it, which on a plain slice triggers pandas'
# SettingWithCopyWarning (hidden here by the global warnings filter).
jsr=jsr[['dt','AverageTemperature']].copy()
jsr.head()

In [None]:
# Inspect dtypes and non-null counts of the two retained columns.
jsr.info()

In [None]:
# Rename the columns positionally: 'dt' -> 'Date', 'AverageTemperature' -> 'Temp'.
jsr.columns=['Date','Temp']
jsr.head()

In [None]:
# Parse 'Date' into datetime64, then count missing values per column.
jsr['Date']=pd.to_datetime(jsr['Date'])
jsr.isna().sum()

In [None]:
# Drop rows with any missing value. Reassigning (instead of inplace=True)
# avoids mutating a frame that was derived by slicing and keeps the step
# explicit about producing a new object.
jsr=jsr.dropna()
jsr.shape

In [None]:
# Move 'Date' from a column into the index.
# NOTE: not re-runnable as-is; once 'Date' is the index it is no longer a
# column, so a second execution raises KeyError.
jsr.set_index('Date',inplace=True)
jsr.head()

In [None]:
# Temperature over time for the selected city, on a wide figure with a grid.
plt.figure(figsize=(20, 5))
sns.lineplot(x=jsr.index,y=jsr['Temp'])
plt.grid()

In [None]:
# Run the Augmented Dickey-Fuller test on the temperature series and show the
# raw result tuple.
from statsmodels.tsa.stattools import adfuller
test_result=adfuller(jsr['Temp'])
test_result

In [None]:
def adfuller_test(Temp):
    """Run ``adfuller`` on ``Temp`` and print a short, labelled report.

    The first four items of the result are printed as 'label : value' lines,
    followed by a verdict line chosen by comparing the second item (the
    p-value) against 0.05. Nothing is returned.
    """
    outcome = adfuller(Temp)
    field_names = ('ADF Test Statistic', 'p-value', '#Lags Used', 'Number of Observations Used')
    # zip stops after the four named fields, so any extra result items are ignored.
    for field_name, stat in zip(field_names, outcome):
        print(' : '.join((field_name, str(stat))))
    verdict = (
        "strong evidence against the null hypothesis(Ho), reject the null hypothesis. Data has no unit root and is stationary"
        if outcome[1] <= 0.05
        else "weak evidence against null hypothesis, time series has a unit root, indicating it is non-stationary "
    )
    print(verdict)

In [None]:
# Print the labelled stationarity report for the raw temperature series.
adfuller_test(jsr['Temp'])

In [None]:
# Work on an independent copy so the differencing below does not alter `jsr`.
df=jsr.copy()
df.head()

In [None]:
# Difference each value against the one 12 rows earlier (a lag-12 difference,
# despite the column name); the first 12 rows are therefore NaN.
df['first_temp_diff']=df['Temp'].diff(12)
df.head(25)

In [None]:
# Repeat the stationarity report on the differenced series (NaN rows removed).
adfuller_test(df['first_temp_diff'].dropna())

In [None]:
# Plot the differenced series over time.
df[['first_temp_diff']].plot(figsize=(20,5))
plt.grid()

# Examine whether there is a seasonality factor in your data or not

In [None]:
# Preview the city series before adding calendar columns.
jsr.head()

In [None]:
# Calendar month of each observation, taken from the datetime index.
jsr['month']=jsr.index.month
jsr.head()

In [None]:
# Calendar year of each observation, taken from the datetime index.
jsr['year']=jsr.index.year
jsr.head()

In [None]:
# Month-by-year table of temperatures: one row per month, one column per year
# (pivot_table aggregates with the mean by default).
pivot=jsr.pivot_table(values='Temp',index='month',columns='year')
pivot

In [None]:
# One line per year across the months; the legend is removed because there is
# one entry per year column.
pivot.plot(figsize=(20,5))
plt.legend().remove()
plt.xlabel('Months')
plt.ylabel('Temperatures')
plt.grid()

From this graph we can see that the data is clearly seasonal: the higher temperatures fall between June and August, and the lower ones between April and June. This is an observation we can draw from the plot and present to our client.

In [None]:
# Average each month's temperature across all year columns.
monthly_seasonality=pivot.mean(axis=1)
monthly_seasonality.head()

In [None]:
# Plot the across-year average temperature for each month.
monthly_seasonality.plot(figsize=(20,5))
plt.grid()

In [None]:
# Preview the working copy that holds the raw and differenced temperatures.
df.head()

In [None]:
# Keep only the differenced column for the modelling steps that follow.
df=df[['first_temp_diff']]
df.head()

In [None]:
# Remove the leading rows that have no differenced value (NaN).
df.dropna(inplace=True)
df.head()

In [None]:
# 7-row rolling mean of the differenced series; the first 6 values are NaN
# because the window is not yet full.
df['first_temp_diff'].rolling(window=7).mean()

In [None]:
# Wrap the differenced series in its own DataFrame to use as the "actual" column.
value=pd.DataFrame(df['first_temp_diff'])
value

In [None]:
# Side-by-side frame of the actual values and their 7-row rolling mean.
# NOTE(review): the rolling window ends at the current row, so each "forecast"
# value already includes the actual value it is compared against; a genuine
# moving-average forecast would be shifted forward by one step.
temp_df=pd.concat([value,df['first_temp_diff'].rolling(window=7).mean()],axis=1)
temp_df

In [None]:
# Name the two columns: the differenced series and its rolling mean.
temp_df.columns=['actual_temp','forecast_temp']
temp_df.head(20)

In [None]:
from sklearn.metrics import mean_squared_error
# RMSE between the actual values and the rolling-mean baseline.
# Rows where the rolling mean is undefined are removed with dropna() rather
# than a hard-coded `[6:]` offset, so the cell stays correct if the window
# size changes. Arguments follow the (y_true, y_pred) order.
scored=temp_df.dropna()
np.sqrt(mean_squared_error(scored['actual_temp'],scored['forecast_temp']))

In [None]:
# Autocorrelation / partial-autocorrelation plotting helpers.
from statsmodels.graphics.tsaplots import plot_acf,plot_pacf
df.head()

In [None]:
# Autocorrelation plot of the differenced series.
plot_acf(df['first_temp_diff'].dropna())

In [None]:
# Partial-autocorrelation plot of the differenced series.
plot_pacf(df['first_temp_diff'].dropna())

In [None]:
# Confirm that no missing values remain before modelling.
df.isna().sum()

In [None]:
# Number of rows available for the train/test split below.
df.shape

In [None]:
# Chronological split: the first 2400 rows are used for fitting.
training_data=df[0:2400]
# The remaining rows form the test set: data the model never sees during
# fitting, used only for evaluation.
test_data=df[2400:]

In [None]:
# Preview the start of the training split.
training_data.head()

In [None]:
# Preview the start of the test split.
test_data.head()

In [None]:
# Build an ARIMA(p=2, d=1, q=3) model on the training split.
# NOTE(review): statsmodels.tsa.arima_model is the legacy ARIMA implementation
# (deprecated in statsmodels 0.12, removed in 0.13) - confirm the installed
# version. Its replacement, statsmodels.tsa.arima.model.ARIMA, has different
# fit()/forecast() signatures, so the cells that call them must be migrated
# together with this import.
from statsmodels.tsa.arima_model import ARIMA
arima=ARIMA(training_data,order=(2,1,3))

In [None]:
# Estimate the model parameters on the training data.
model=arima.fit()

In [None]:
# Forecast one value per test row. With the legacy ARIMA results object,
# forecast() returns a tuple whose first item holds the point forecasts,
# hence the [0].
predictions=model.forecast(steps=len(test_data))[0]
predictions[:5]

In [None]:
# RMSE of the ARIMA forecasts against the held-out test rows.
np.sqrt(mean_squared_error(test_data,predictions))

#### Model Tuning or Hyperparameter Tuning or choosing the best pair of (p,d,q)

In [None]:
# Candidate values (0-3) for each ARIMA order component; the search below
# tries every (p, d, q) combination.
p_values=range(0,4)
q_values=range(0,4)
d_values=range(0,4)

In [None]:
# Grid search over ARIMA(p, d, q) orders.
#
# Each candidate order is fitted ONCE on the training split and scored by the
# MSE of its forecast over the whole test horizon - the same evaluation used
# for the single ARIMA model above. Fitting inside a `for i in
# range(len(test))` loop without ever extending `train` would only refit the
# identical model len(test) times, and mean_squared_error would raise on the
# length mismatch for every iteration but the last.
train=df[0:2400]
test=df[2400:]
min_error=[]      # MSE of every order that fitted successfully
order_errors={}   # order -> MSE, so the winning order can be looked up
for p in p_values:
    for d in d_values:
        for q in q_values:
            order=(p,d,q)
            try:
                arima=ARIMA(train,order=order)
                model=arima.fit(disp=0)
                predictions=model.forecast(steps=len(test))[0]
                error=mean_squared_error(test,predictions)
            except Exception as exc:
                # Some orders may fail to fit or forecast; report them instead
                # of swallowing the error silently, then try the next order.
                print('skipping order {}: {}'.format(order,exc))
                continue
            order_errors[order]=error
            min_error.append(error)
            print('MSE is {} with order {}'.format(error,order))
            print('min MSE is {} '.format(min(min_error)))

if order_errors:
    best_order=min(order_errors,key=order_errors.get)
    print('best order is {} with MSE {}'.format(best_order,order_errors[best_order]))
else:
    print('no ARIMA order could be fitted')

The ideal choice for (p, d, q) is (2, 2, 1), where
- p is the number of autoregressive terms,
- d is the number of nonseasonal differences needed for stationarity, and
- q is the number of lagged forecast errors in the prediction equation.

It has the lowest MSE of all the orders tried, i.e. 1.3563.