In [1]:
# Necessary imports

import numpy as np
import pandas as pd

from datetime import datetime
from sklearn.metrics import mean_squared_error
from math import sqrt

import matplotlib.pyplot as plt
import seaborn as sns
from pandas.plotting import register_matplotlib_converters

import statsmodels.api as sm
from statsmodels.tsa.api import Holt

import warnings
warnings.filterwarnings("ignore")

In [2]:
# Acquire the data

df = pd.read_csv('GlobalLandTemperaturesByState.csv')
df.head()

Unnamed: 0,dt,AverageTemperature,AverageTemperatureUncertainty,State,Country
0,1855-05-01,25.544,1.171,Acre,Brazil
1,1855-06-01,24.228,1.103,Acre,Brazil
2,1855-07-01,24.371,1.044,Acre,Brazil
3,1855-08-01,25.427,1.073,Acre,Brazil
4,1855-09-01,25.675,1.014,Acre,Brazil


In [3]:
# Check the shape

df.shape

(645675, 5)

In [4]:
# Make sure the dates are in datetime format

df.assign(ds = pd.to_datetime(df.dt))

Unnamed: 0,dt,AverageTemperature,AverageTemperatureUncertainty,State,Country,ds
0,1855-05-01,25.544,1.171,Acre,Brazil,1855-05-01
1,1855-06-01,24.228,1.103,Acre,Brazil,1855-06-01
2,1855-07-01,24.371,1.044,Acre,Brazil,1855-07-01
3,1855-08-01,25.427,1.073,Acre,Brazil,1855-08-01
4,1855-09-01,25.675,1.014,Acre,Brazil,1855-09-01
...,...,...,...,...,...,...
645670,2013-05-01,21.634,0.578,Zhejiang,China,2013-05-01
645671,2013-06-01,24.679,0.596,Zhejiang,China,2013-06-01
645672,2013-07-01,29.272,1.340,Zhejiang,China,2013-07-01
645673,2013-08-01,29.202,0.869,Zhejiang,China,2013-08-01


In [5]:
# Make date the index

df = df.set_index('dt').sort_index()

In [6]:
# Filter for Texas

tx_df = df[df['State'].str.contains("Texas")]
tx_df

Unnamed: 0_level_0,AverageTemperature,AverageTemperatureUncertainty,State,Country
dt,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1820-01-01,4.489,3.369,Texas,United States
1820-02-01,9.081,2.873,Texas,United States
1820-03-01,12.657,2.423,Texas,United States
1820-04-01,19.215,2.501,Texas,United States
1820-05-01,22.577,2.495,Texas,United States
...,...,...,...,...
2013-05-01,22.628,0.158,Texas,United States
2013-06-01,27.841,0.220,Texas,United States
2013-07-01,27.630,0.182,Texas,United States
2013-08-01,28.663,0.247,Texas,United States


In [7]:
# Check for nulls

tx_df.isnull().sum()

AverageTemperature               0
AverageTemperatureUncertainty    0
State                            0
Country                          0
dtype: int64

In [8]:
# Describe the data

tx_df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
AverageTemperature,2325.0,18.107234,7.413791,1.965,11.206,18.378,25.398,31.401
AverageTemperatureUncertainty,2325.0,0.794785,0.893291,0.044,0.18,0.31,1.241,4.95


### Takeaways
- much smaller dataset after filtering for only Texas
- no nulls
- converted temps from celcius to fahrenheit

In [9]:
# Converting the AvgTemp from celcius to fahrenheit

tx_df['AverageTemperature'] = (tx_df['AverageTemperature']*9/5)+32

In [10]:
# Check the converted data

tx_df

Unnamed: 0_level_0,AverageTemperature,AverageTemperatureUncertainty,State,Country
dt,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1820-01-01,40.0802,3.369,Texas,United States
1820-02-01,48.3458,2.873,Texas,United States
1820-03-01,54.7826,2.423,Texas,United States
1820-04-01,66.5870,2.501,Texas,United States
1820-05-01,72.6386,2.495,Texas,United States
...,...,...,...,...
2013-05-01,72.7304,0.158,Texas,United States
2013-06-01,82.1138,0.220,Texas,United States
2013-07-01,81.7340,0.182,Texas,United States
2013-08-01,83.5934,0.247,Texas,United States
