In [3]:
import pandas as pd
import numpy as np
import sklearn
import plotly.express as px 
import plotly.graph_objects as go
import seaborn as sns
import matplotlib.pyplot as plt
from numpy import set_printoptions
from mlxtend.evaluate import bias_variance_decomp
from plotly.subplots import make_subplots
from datetime import datetime ## Time Series analysis.
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_regression

# Silence NumPy floating-point warnings for divide-by-zero and invalid operations.
np.seterr(invalid='ignore', divide='ignore')

# Read the raw CSV and replace every missing cell with an empty string.
weather_df = pd.read_csv("weatherHistory.csv").fillna('')
print("Null Values removed from data")

# Drop exact duplicate rows when there are any, then renumber the index from 0.
boolean = weather_df.duplicated().any()
if not boolean:
    print("No Duplicate entries in data")
else:
    weather_df = weather_df.drop_duplicates().reset_index(drop=True)
    print("Duplicate entries removed from data")

print("############## Data Preprocessing Starts ########################")

# Calendar features come from the local wall-clock part of "Formatted Date"
# (everything before the trailing UTC offset). The offset is deliberately not
# parsed: if the file mixes offsets (e.g. summer/winter time), recent pandas
# versions no longer return a datetime column from a "%z" parse without
# utc=True, and converting to UTC would move rows near midnight onto a
# different calendar day than the per-row local date used previously.
local_time = pd.to_datetime(
    weather_df["Formatted Date"].str.rsplit(" ", n=1).str[0],
    format="%Y-%m-%d %H:%M:%S.%f",
)
weather_df["year"] = local_time.dt.year
weather_df["month"] = local_time.dt.month
weather_df["day"] = local_time.dt.day

print("Converting categorical data into features")

# One-hot encode the precipitation type. Both levels are selected so that a
# missing "rain" or "snow" category still fails loudly; "rain" is dropped again
# below, leaving "snow" as the only indicator column.
dms = pd.get_dummies(weather_df["Precip Type"])
weather_df = pd.concat([weather_df, dms[["rain", "snow"]]], axis=1)

# Remove the text columns, the raw date string, the apparent temperature and
# the redundant "rain" dummy, then switch to short snake_case names.
weather_df = weather_df.drop(
    columns=[
        "Apparent Temperature (C)",
        "Precip Type",
        "Loud Cover",
        "Summary",
        "Daily Summary",
        "Formatted Date",
        "rain",
    ]
).rename(
    columns={
        "Temperature (C)": "temperature",
        "Humidity": "humidity",
        "Wind Speed (km/h)": "wind_speed",
        "Visibility (km)": "visibility",
    }
)

# Lower-triangle correlation heatmap of the remaining columns: the upper
# triangle (diagonal included) is masked out because it mirrors the lower one.
matrix = weather_df.corr()
matrix = matrix.round(2)
mask = np.triu(np.ones(matrix.shape, dtype=bool))
sns.heatmap(
    matrix,
    mask=mask,
    cmap='vlag',
    center=0,
    vmin=-1,
    vmax=1,
    annot=True,
)
plt.show()

# Univariate feature selection: score every predictor against the target with
# an F-test for regression and keep the five highest-scoring columns.
print("Feature Selection")
Y = weather_df["temperature"]
X = weather_df.loc[:, weather_df.columns != "temperature"]

test = SelectKBest(f_regression, k=5)
fit = test.fit(X, Y)

# Show the per-feature scores and the boolean mask of the selected columns.
set_printoptions(precision=3)
print(fit.scores_)
print(test.get_support())

# Reduce X to the selected columns and preview the first five rows.
features = fit.transform(X)
print(features[:5])
weather_df.head(5)

ImportError: ignored

In [None]:
from statsmodels.stats.outliers_influence import variance_inflation_factor
  
  
# Independent variables only: the target "temperature" is excluded so that the
# VIF values describe collinearity among the predictors themselves. Non-numeric
# columns (for example a date column) are left out, and everything is cast to
# float so that X.values is a plain numeric array even when a dummy column is
# stored as bool.
X = (
    weather_df.drop(columns=["temperature"])
    .select_dtypes(include=["number", "bool"])
    .astype(float)
)

# Convert once instead of rebuilding the array for every feature.
design = X.values

# One row per predictor with its variance inflation factor.
vif_data = pd.DataFrame({"feature": X.columns})
vif_data["VIF"] = [variance_inflation_factor(design, i)
                   for i in range(design.shape[1])]

print(vif_data)

In [None]:


# Keep NumPy quiet about divide-by-zero / invalid floating-point operations.
np.seterr(invalid='ignore', divide='ignore')

# Fresh load of the raw data; every missing cell becomes an empty string.
weather_df = pd.read_csv("weatherHistory.csv")
weather_df = weather_df.fillna('')
print("Null Values removed from data")

# Exact duplicate rows are removed (if present) and the index is rebuilt.
boolean = weather_df.duplicated().any()
if not boolean:
    print("No Duplicate entries in data")
else:
    weather_df = weather_df.drop_duplicates()
    weather_df = weather_df.reset_index(drop=True)
    print("Duplicate entries removed from data")

print("############## Data Preprocessing Starts ########################")

raw_dates = weather_df["Formatted Date"]

# Calendar features come from the local wall-clock part of the string
# (everything before the trailing UTC offset), so they match the per-row local
# date regardless of how the offset is handled below.
local_time = pd.to_datetime(
    raw_dates.str.rsplit(" ", n=1).str[0],
    format="%Y-%m-%d %H:%M:%S.%f",
)
weather_df["year"] = local_time.dt.year
weather_df["month"] = local_time.dt.month
weather_df["day"] = local_time.dt.day

# The timestamp itself is normalised to UTC. If the file mixes UTC offsets
# (e.g. summer/winter time), recent pandas versions do not return a datetime
# column from a "%z" parse unless utc=True is given; UTC also keeps the
# chronological ordering used later for sorting and the time-series plot.
weather_df["Formatted Date"] = pd.to_datetime(
    raw_dates, format="%Y-%m-%d %H:%M:%S.%f %z", utc=True
)

print("Converting categorical data into features")

# One-hot encode the precipitation type. Both levels are selected so that a
# missing "rain" or "snow" category still fails loudly; "rain" is dropped again
# below, leaving "snow" as the only indicator column.
dms = pd.get_dummies(weather_df["Precip Type"])
weather_df = pd.concat([weather_df, dms[["rain", "snow"]]], axis=1)

# Remove the text columns, the apparent temperature and the redundant "rain"
# dummy, then switch to short snake_case names (the timestamp becomes "date").
weather_df = weather_df.drop(
    columns=[
        "Apparent Temperature (C)",
        "Precip Type",
        "Loud Cover",
        "Summary",
        "Daily Summary",
        "rain",
    ]
).rename(
    columns={
        "Formatted Date": "date",
        "Temperature (C)": "temperature",
        "Humidity": "humidity",
        "Wind Speed (km/h)": "wind_speed",
        "Wind Bearing (degrees)": "wind_bear",
        "Pressure (millibars)": "pressure",
        "Visibility (km)": "visibility",
    }
)

#weather_df.corr()
#weather_df.isnull().sum()

# Feature Selection with Univariate Statistical Tests

# Predictors are every column except the target and the timestamp.
X = weather_df.drop(columns=["temperature", "date"])
Y = weather_df["temperature"]

# Score each predictor against the target with an F-test and keep the best 5.
test = SelectKBest(score_func=f_regression, k=5)
fit = test.fit(X, Y)

# Summarize scores and the boolean mask of selected columns.
set_printoptions(precision=3)
print(fit.scores_)
print(test.get_support())
features = fit.transform(X)

# Summarize selected features. The names are taken from X itself, so each
# name is guaranteed to line up with its entry in the support mask (a
# hard-coded column slice of weather_df would silently misalign as soon as
# the column layout changes).
print("Feature Selection, k = 5")
df = pd.DataFrame({"feature": X.columns, "selected": test.get_support()})
print(df)


print("############## Data Preprocessing Ends ########################")

print("############## Data Analysis Starts ########################")

# Order the rows chronologically so the line plot follows the timeline.
weather_df = weather_df.sort_values(by='date')

# Temperature over time, with a range slider and two preset zoom buttons.
y_axis = {"range": [-25, weather_df['temperature'].max() + 1]}
fig = go.Figure(layout=go.Layout(yaxis=y_axis))
fig.add_trace(go.Scatter(x=weather_df['date'], y=weather_df['temperature']))
fig.update_layout(
    title='Temprature Throught Timeline:',
    xaxis_title='Time',
    yaxis_title='Temprature in Degrees',
)
zoom_buttons = [
    {"label": "Whole View", "step": "all"},
    {"count": 1, "label": "One Year View", "step": "year", "stepmode": "todate"},
]
fig.update_layout(
    xaxis=go.layout.XAxis(
        rangeselector={"buttons": zoom_buttons},
        rangeslider={"visible": True},
        type="date",
    )
)
fig.show()

# Box plots of temperature against selected columns, before outlier removal.
# Each entry is (x column, colour column or None, figure title).
box_specs = [
    ('month', 'month', 'temperature vs Month before removing outliers'),
    ('humidity', None, 'temperature vs Humidity before removing outliers'),
    ('visibility', None, 'temperature vs Visibility before removing outliers'),
    ('wind_bear', None, 'temperature vs wind_bear before removing outliers'),
]
for x_column, colour_column, box_title in box_specs:
    fig = px.box(weather_df, x_column, 'temperature', color=colour_column)
    fig.update_layout(title=box_title)
    fig.show()

weather_df.describe()

In [None]:
############ IQR ##############
# Remove outliers feature by feature using the 1.5 * IQR rule. The features are
# processed sequentially: the quartiles of each feature are computed on the
# rows that survived the previous features' filters.
outlier_features = ['temperature', 'humidity', "visibility", "wind_bear"]
print("IQR")
for feature in outlier_features:
    values = weather_df[feature]

    # Series.quantile is used instead of np.percentile(..., interpolation=...),
    # whose `interpolation` keyword is deprecated in favour of `method`.
    Q1 = values.quantile(0.25, interpolation='midpoint')
    Q3 = values.quantile(0.75, interpolation='midpoint')
    IQR = Q3 - Q1

    upper_limit = Q3 + 1.5 * IQR
    lower_limit = Q1 - 1.5 * IQR
    print("Feature - " + str(feature) + ", Upper Limit - " + str(upper_limit))
    print("Feature - " + str(feature) + ", Lower Limit - " + str(lower_limit))

    # Rows on or beyond either limit are treated as outliers and removed.
    is_outlier = (values >= upper_limit) | (values <= lower_limit)
    print('Old Shape - ', weather_df.shape)
    weather_df = weather_df.loc[~is_outlier]
    print("New Shape - ", weather_df.shape)

# Renumber the surviving rows; the result is assigned back so the reset
# actually takes effect.
weather_df = weather_df.reset_index(drop=True)

# Order the rows chronologically so the line plot follows the timeline.
weather_df = weather_df.sort_values(by='date')

# Temperature over time, with a range slider and two preset zoom buttons.
fig = go.Figure(layout=go.Layout(yaxis=dict(range=[-25, weather_df['temperature'].max() + 1])))
fig.add_trace(go.Scatter(x=weather_df['date'], y=weather_df['temperature']))
fig.update_layout(title='Temprature Throught Timeline:',
                  xaxis_title='Time', yaxis_title='Temprature in Degrees')
fig.update_layout(xaxis=go.layout.XAxis(
    rangeselector=dict(
        buttons=[dict(label="Whole View", step="all"),
                 dict(count=1, label="One Year View", step="year", stepmode="todate")]),
    rangeslider=dict(visible=True), type="date")
)
fig.show()

# Box plots of temperature against selected columns, after outlier removal.
# The whole frame is plotted: slicing the first 100 rows of the date-sorted
# frame would only show the earliest observations and could not be compared
# with box plots drawn from the full data.
# Each entry is (x column, colour column or None, figure title).
box_specs = [
    ('month', 'month', 'temperature vs Month after removing outliers'),
    ('humidity', None, 'temperature vs Humidity after removing outliers'),
    ('visibility', None, 'temperature vs Visibility after removing outliers'),
    ('wind_bear', None, 'temperature vs wind_bear after removing outliers'),
]
for x_column, colour_column, box_title in box_specs:
    fig = px.box(weather_df, x_column, 'temperature', color=colour_column)
    fig.update_layout(title=box_title)
    fig.show()


print("############## Data Analysis Ends ########################")

In [None]:
# Scatter of temperature against wind bearing for the first 500 rows.
arr_wea = weather_df.iloc[:500]
plt.scatter(arr_wea.wind_bear, arr_wea.temperature)
# plt.xlabel requires the label text; calling it with no argument raises
# TypeError before the figure is shown.
plt.xlabel("Wind bearing (degrees)")
plt.ylabel("Temperature (C)")
plt.show()

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import Ridge,Lasso,LinearRegression
from sklearn.metrics import mean_squared_error,r2_score, mean_absolute_error

def _evaluate_and_plot(model_name, plot_title, y_true, y_hat, n_points=100):
    """Score one model's predictions and plot them against the true values.

    Parameters
    ----------
    model_name : str
        Label stored in the first field of the returned result row.
    plot_title : str
        Title of the actual-vs-predicted figure.
    y_true : array-like
        True target values of the test set.
    y_hat : array-like
        Predicted target values, in the same order as ``y_true``.
    n_points : int, default 100
        Number of leading test samples drawn in the figure.

    Returns
    -------
    list
        ``[model_name, R^2 in percent, MAE, RMSE]``.
    """
    # RMSE is computed as sqrt(MSE) rather than through the `squared=False`
    # argument of mean_squared_error, which is deprecated in scikit-learn.
    rmse = np.sqrt(mean_squared_error(y_true, y_hat))
    result_row = [
        model_name,
        r2_score(y_true, y_hat) * 100,
        mean_absolute_error(y_true, y_hat),
        rmse,
    ]

    fig, ax = plt.subplots(figsize=(10, 5))
    ax.plot(np.asarray(y_true)[:n_points], label="original values")
    ax.plot(np.asarray(y_hat)[:n_points], label="predicted values")
    ax.legend(loc='upper right')
    ax.set_title(plot_title)
    ax.set_xlabel("Day")
    ax.set_ylabel("temperature")
    plt.show()
    return result_row


# Plotting the triangle heatmap for correlation for the selected features.
# Only numeric/bool columns are correlated: the frame still contains the
# "date" column here, which cannot take part in a correlation matrix.
matrix = weather_df.select_dtypes(include=["number", "bool"]).corr().round(2)
mask = np.triu(np.ones_like(matrix, dtype=bool))
sns.heatmap(matrix, annot=True, vmax=1, vmin=-1, center=0, cmap='vlag', mask=mask)
plt.show()

print("################# Train and Test Split ####################")
x_unscaled = weather_df.drop(["temperature", "date"], axis=1)
y = weather_df["temperature"]
x_train_raw, x_test_raw, y_train, y_test = train_test_split(
    x_unscaled, y, test_size=0.20, random_state=42
)

print("################# Scalarizing data ####################")
# The scaler is fitted on the training rows only, so no statistics of the
# test set leak into the features the models are trained on.
sc = StandardScaler().fit(x_train_raw)
x_train = pd.DataFrame(sc.transform(x_train_raw))
x_test = pd.DataFrame(sc.transform(x_test_raw))
# Scaled copy of all rows, kept under the name this cell has always defined.
x = pd.DataFrame(sc.transform(x_unscaled))

print("################# Fit and Predict ####################")
######################## Test Starts ########################
model_output = []

ridge_model = Ridge().fit(x_train, y_train)
y_pred = ridge_model.predict(x_test)
model_output.append(
    _evaluate_and_plot("Ridge Regression", "Ridge : Actual vs Predicted", y_test, y_pred)
)

lasso_model = Lasso().fit(x_train, y_train)
y_pred = lasso_model.predict(x_test)
model_output.append(
    _evaluate_and_plot("Lasso Regression", "Lasso : Actual vs Predicted", y_test, y_pred)
)

linReg_model = LinearRegression().fit(x_train, y_train)
y_pred = linReg_model.predict(x_test)
model_output.append(
    _evaluate_and_plot("Multiple Linear Regression", "Multiple Linear : Actual vs Predicted",
                       y_test, y_pred)
)

# Degree-2 polynomial expansion followed by ordinary least squares. The
# expansion is fitted on the training features and only applied (transform)
# to the test features.
poly_reg = PolynomialFeatures(degree=2)
X_poly = poly_reg.fit_transform(x_train)
lin_reg2 = LinearRegression()
lin_reg2.fit(X_poly, y_train)
y_pred = lin_reg2.predict(poly_reg.transform(x_test))
model_output.append(
    _evaluate_and_plot("Polynomial Regression", "Polynomial : Actual vs Predicted",
                       y_test, y_pred)
)

# The last column holds the root of the mean squared error, so it is labelled
# RMSE rather than MSE.
df_columns = ["Model", "R_Squared", "MAE", "RMSE"]
df_models = pd.DataFrame(model_output, columns=df_columns)
df_models

In [None]:
# Actual vs predicted values over the complete test set, using whatever
# predictions are currently stored in y_pred.
pred = pd.DataFrame(y_pred)
plt.figure(figsize=(10, 5))
# Drop the original row labels so both curves share a 0..n-1 x-axis.
index = y_test.reset_index()["temperature"]
ax = index.plot(label="original values")
ax = pred[0].plot(label="predicted values")
plt.legend(loc='upper right')
plt.title("Test vs Pred")
plt.xlabel("Day")
# The plotted target is the "temperature" column, so the axis is labelled
# accordingly.
plt.ylabel("temperature")
plt.show()

In [None]:
import numpy as np
import pandas as pd
# Turn off pandas' chained-assignment warning for the rest of the session.
pd.set_option("mode.chained_assignment", None)

# Load the Disneyland reviews file (Latin-1 encoded) and preview it.
# NOTE(review): this rebinds `weather_df`, replacing any frame previously held
# under that name - confirm the reuse of the name is intended.
weather_df = pd.read_csv(
    "DisneylandReviews.csv",
    encoding='latin-1',
)
weather_df.head()
