In [None]:
# Core numerical, tabular and plotting libraries used by the cells below.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
print("setup completed")  # confirms the imports above ran without error

# Import file path 

In [None]:
# Relative location of the weatherAUS CSV file that is read into a DataFrame below.
filepath = '../input/weather-dataset-rattle-package/weatherAUS.csv'

# Convert into DataFrame

In [None]:
# Load the raw CSV into a DataFrame and report its (rows, columns) shape.
df = pd.read_csv(filepath)
print(df.shape)
# Preview only the first rows; the shape printed above already conveys the size.
df.head()

In [None]:
# Column labels of the DataFrame (the same Index that DataFrame.keys() returns).
df.columns

In [None]:
# Print a per-column summary: dtype, non-null count, and overall memory usage.
df.info()

In [None]:
# Descriptive statistics (count, mean, std, min, quartiles, max) of the numeric columns.
df.describe()

# Finding numerical and categorical columns

In [None]:
# Partition the column names by dtype: 'object' columns versus everything else.
is_object = {name: df[name].dtype == 'object' for name in df.columns}
object_col = [name for name, flag in is_object.items() if flag]
numeric_val = [name for name, flag in is_object.items() if not flag]

print(object_col)
print(numeric_val)

# Finding unique values of columns

In [None]:
# Number of distinct values in each column (missing values are not counted by default).
df.nunique()

In [None]:
# Print every column name next to the array of distinct values it contains.
col = df.columns
for name in col:
    distinct_values = df[name].unique()
    print(name, '--> ', distinct_values)

# Finding null values of columns

In [None]:
# Missing-value count per column (isna performs the same check as isnull).
df.isna().sum()

In [None]:
# Total number of missing entries across the whole DataFrame.
df.isna().sum().sum()

In [None]:
# Draw the boolean missing-value mask of df as a heatmap (one tile per cell).
fig, ax = plt.subplots(figsize=(30, 30))
sns.heatmap(df.isnull(), cbar=False, cmap='Dark2', ax=ax)
ax.set_title("Heatmap of the with nan values", fontsize=30)
ax.set_xlabel("Columns", fontsize=30)
ax.set_ylabel("Rows", fontsize=30)
plt.show()

# Handling missing values using mean and mode

In [None]:
# Encode the Yes/No flag as 1/0; entries that are neither 'Yes' nor 'No' become NaN.
yes_no_codes = {'Yes': 1, 'No': 0}
df['RainToday'] = df['RainToday'].map(yes_no_codes)
df['RainToday']

In [None]:
# Encode the Yes/No flag as 1/0; entries that are neither 'Yes' nor 'No' become NaN.
df['RainTomorrow'] = df['RainTomorrow'].map({'Yes': 1, 'No': 0})
# Display the column that was just encoded.
df.RainTomorrow

# Filling missing values with mean (numerical columns)

In [None]:
# Numeric columns whose gaps are filled with that column's own mean.
mean_fill_cols = [
    'MinTemp', 'MaxTemp', 'Rainfall', 'Evaporation', 'Sunshine',
    'WindGustSpeed', 'WindSpeed9am', 'WindSpeed3pm',
    'Humidity9am', 'Humidity3pm', 'Pressure9am', 'Pressure3pm',
    'Cloud9am', 'Cloud3pm', 'Temp9am', 'Temp3pm',
]
# fillna with a Series of means matches on column label, so every column
# receives its own mean rather than a single global value.
df[mean_fill_cols] = df[mean_fill_cols].fillna(df[mean_fill_cols].mean())

# Filling missing values with mode (binary and categorical columns)

In [None]:
# Fill gaps in the 0/1 rain flags and the wind-direction columns with each
# column's most frequent value.
# NOTE(review): RainTomorrow appears to be the prediction target further down;
# filling it with the mode adds guessed labels — confirm this is intended.
for name in ('RainToday', 'RainTomorrow', 'WindDir9am', 'WindGustDir', 'WindDir3pm'):
    most_frequent = df[name].mode()[0]
    df[name] = df[name].fillna(most_frequent)

# Checking whether any null values are left

In [None]:
# Redraw the missing-value mask of df as a heatmap to inspect what is left.
fig, ax = plt.subplots(figsize=(30, 30))
sns.heatmap(df.isnull(), cbar=False, cmap='Dark2', ax=ax)
ax.set_title("Heatmap of the without nan values", fontsize=30)
ax.set_xlabel("Columns", fontsize=30)
ax.set_ylabel("Rows", fontsize=30)
plt.show()

In [None]:
# Percentage of missing entries per column.
miss_val_per = df.isna().mean().mul(100)
miss_val_per

In [None]:
# Total number of missing entries across the whole DataFrame.
df.isna().sum().sum()

# Data Visualization for the dataset

In [None]:
# Side-by-side count plots of the two rain flags, each followed by its
# printed value counts.
fig, axes = plt.subplots(1, 2, figsize=(16, 9))
for axis, name in zip(axes, ('RainToday', 'RainTomorrow')):
    sns.countplot(x=name, hue=name, data=df, ax=axis)
    print(df[name].value_counts())

In [None]:
# Count plot of every wind-direction column on a 2x2 grid (three panels used).
plt.figure(figsize=(16, 10))
wind_panels = [
    ('WindDir9am', "Wind Direction at 9 am"),
    ('WindDir3pm', "Wind Direction at 3 pm"),
    ('WindGustDir', "WindGustDirection"),
]
for position, (name, title) in enumerate(wind_panels, start=1):
    plt.subplot(2, 2, position)
    sns.countplot(x=name, data=df)
    plt.title(title)

# Report the most frequent direction from the data itself so the statement
# cannot drift out of sync with the plots if the input changes.
print(f"At 9 am, it is highest for direction {df['WindDir9am'].mode()[0]}")
print(f"At 3 pm, it is highest for direction {df['WindDir3pm'].mode()[0]}")

# Dropping date column

In [None]:
# Remove the Date column. The drop is not done in place and tolerates the
# column already being absent, so re-running this cell does not raise.
df = df.drop(columns='Date', errors='ignore')
# Preview the first rows instead of rendering the whole DataFrame.
df.head()

# Encoding the categorical variables

In [None]:
from sklearn import preprocessing

# Replace each categorical column with integer codes. A separate encoder is
# fitted and kept per column so that every code can be mapped back to its
# label later via label_encoders[column].inverse_transform(...).
label_encoders = {}
for name in ('Location', 'WindDir9am', 'WindDir3pm', 'WindGustDir'):
    # `le` ends up bound to the encoder of the last column (WindGustDir).
    le = preprocessing.LabelEncoder()
    df[name] = le.fit_transform(df[name])
    label_encoders[name] = le

In [None]:
# Show the first 20 rows of the DataFrame.
df.head(20)

# Checking correlation

In [None]:
# Annotated heatmap of the pairwise correlation matrix of all columns in df.
plt.figure(figsize=(25, 25))
ax = sns.heatmap(df.corr(), linewidths=1, square=True, annot=True, fmt='.2f')
plt.title('Correlation of the DataFrame', fontsize=20)
plt.show()

In [None]:
# Hard-coded observations about strongly correlated column pairs, printed
# one pair per line.
correlated_pairs = [
    ('Temp3pm', 'Temp9am'),
    ('Humidity9am', 'Humidity3pm'),
    ('MaxTemp', 'Temp9am'),
    ('MaxTemp', 'Temp3pm'),
    ('MinTemp', 'Temp9am'),
    ('MinTemp', 'Temp3pm'),
]
for first, second in correlated_pairs:
    print(f'{first} and {second} highly correlated')

# Boxplot and Bivariate Analysis



In [None]:
import warnings
# NOTE: this silences every warning for the rest of the kernel session, not
# only for this cell.
warnings.filterwarnings("ignore")

plt.figure(figsize=(16, 16))

# Vertical box plots on the first four panels of a 3x2 grid. The Series is
# passed as `y` explicitly: a positional argument is bound to different
# parameters by different seaborn versions, which changes the orientation.
box_panels = [
    ('Humidity3pm', 'r'),
    ('Humidity9am', 'r'),
    ('Pressure3pm', 'c'),
    ('Pressure9am', 'c'),
]
for position, (name, colour) in enumerate(box_panels, start=1):
    plt.subplot(3, 2, position)
    sns.boxplot(y=df[name], orient='v', color=colour)

# Violin plots of the temperature columns grouped by RainToday and split by
# RainTomorrow on the last two panels.
for position, name in enumerate(('MaxTemp', 'MinTemp'), start=5):
    plt.subplot(3, 2, position)
    sns.violinplot(x='RainToday', y=name, data=df, hue='RainTomorrow')
plt.show()

# Removing Outliers

In [None]:
from scipy import stats

print('Shape of DataFrame Before Removing Outliers', df.shape)
# Keep only the rows in which every column's absolute z-score is below 3.
# NOTE(review): the z-score is computed over all columns of df, including the
# integer-encoded and 0/1 columns — confirm that is intended.
abs_z = np.abs(stats.zscore(df))
within_three_sigma = (abs_z < 3).all(axis=1)
df = df[within_three_sigma]
print('Shape of DataFrame After Removing Outliers', df.shape)

# Dropping highly correlated columns

In [None]:
# Discard three of the columns listed as highly correlated, then show what remains.
correlated_cols = ['Temp3pm', 'Temp9am', 'Humidity9am']
df = df.drop(columns=correlated_cols)
df.columns

# Balancing data for training and testing

In [None]:
import os
from imblearn.over_sampling import SMOTE
from collections import Counter

# Oversample the minority class. Features are every column except the last;
# the last column of df is used as the label.
# The sampler gets its own name (so it no longer overwrites the `os` module
# imported above) and a fixed seed so that re-runs give the same rows.
# NOTE(review): resampling happens before the train/test split below, so
# synthetic rows interpolated from rows that end up in the test set can land
# in the training set. Resampling only the training portion would give a less
# optimistic score.
smote = SMOTE(random_state=51)
X, y = smote.fit_resample(df.iloc[:, :-1], df.iloc[:, -1])
count = Counter(y)
print(count)

# Train test split

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
splits = train_test_split(X, y, test_size=0.2, random_state=51)
X_train, X_test, y_train, y_test = splits

# Applying Machine Learning Algorithms

In [None]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsClassifier

# Support Vector Classifier

In [None]:
from sklearn.svm import SVC

# Support vector classifier with default hyper-parameters, scored on the held-out rows.
svc_classifier = SVC().fit(X_train, y_train)
y_pred_scv = svc_classifier.predict(X_test)
accuracy_score(y_true=y_test, y_pred=y_pred_scv)

# Random Forest Regressor

In [None]:
from sklearn.ensemble import RandomForestClassifier

# The label is a 0/1 class, so fit a forest of classification trees and take
# its class predictions directly instead of rounding a regressor's averaged
# output. The variable names are kept unchanged.
rfr = RandomForestClassifier(n_estimators=100, random_state=42)
rfr.fit(X_train, y_train)
y_pred_rfr = rfr.predict(X_test)
accuracy_score(y_test, y_pred_rfr)

# XGBoost Classifier

In [None]:
from xgboost import XGBClassifier

# Gradient-boosted trees with default hyper-parameters, scored on the held-out rows.
xgb_classifier = XGBClassifier()
xgb_classifier.fit(X=X_train, y=y_train)
y_pred_xgb = xgb_classifier.predict(X_test)
accuracy_score(y_true=y_test, y_pred=y_pred_xgb)

# K – Nearest Neighbor Classifier

In [None]:
# 5-nearest-neighbour vote using the Minkowski metric with p=2.
knn_settings = {'n_neighbors': 5, 'metric': 'minkowski', 'p': 2}
knn_classifier = KNeighborsClassifier(**knn_settings)
knn_classifier.fit(X_train, y_train)
y_pred_knn = knn_classifier.predict(X_test)
accuracy_score(y_true=y_test, y_pred=y_pred_knn)

# Gaussian Naive Bayes

In [None]:
# Gaussian naive Bayes with default settings, scored on the held-out rows.
nb_classifier = GaussianNB().fit(X_train, y_train)
y_pred_nb = nb_classifier.predict(X_test)
accuracy_score(y_true=y_test, y_pred=y_pred_nb)

#  Decision Tree Classifier

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Single decision tree split on entropy; the seed makes the fitted tree repeatable.
tree_settings = {'criterion': 'entropy', 'random_state': 51}
dt_classifier = DecisionTreeClassifier(**tree_settings)
dt_classifier.fit(X_train, y_train)
y_pred_dt = dt_classifier.predict(X_test)
accuracy_score(y_true=y_test, y_pred=y_pred_dt)

# AdaBoostClassifier

In [None]:
from sklearn.ensemble import AdaBoostClassifier

# AdaBoost over entropy-based decision trees.
# `algorithm` is deliberately left at the library default: the explicit
# 'SAMME.R' value is deprecated in recent scikit-learn releases and rejected
# by newer ones, while older releases already use it as their default.
adb_classifier = AdaBoostClassifier(
    DecisionTreeClassifier(criterion='entropy', random_state=200),
    n_estimators=2000,
    learning_rate=0.1,
    random_state=1,
)
adb_classifier.fit(X_train, y_train)
y_pred_adb = adb_classifier.predict(X_test)
accuracy_score(y_test, y_pred_adb)

# Result

In [None]:
# Compute every score from the predictions made in the cells above instead of
# pasting numbers from an earlier run, so the summary always matches the
# current models and data.
model_predictions = {
    "RandomForestRegressor": y_pred_rfr.round(),  # rounding maps regressor output to 0/1
    "XGBClassifier": y_pred_xgb,
    "svc_classifier": y_pred_scv,
    "KNeighborsClassifier": y_pred_knn,
    "GaussianNB": y_pred_nb,
    "DecisionTreeClassifier": y_pred_dt,
    "AdaBoostClassifier": y_pred_adb,
}
for model_name, predictions in model_predictions.items():
    print(f"{model_name} {accuracy_score(y_test, predictions)*100} accuracy score ")

In [None]:
# NOTE(review): the model name and the 91% figure are hard-coded; re-check
# them against the scores above after re-running the notebook.
print("AdaBoostClassifier is the best model for this dataset with 91%")

In [None]:
from sklearn.metrics import classification_report

# Per-class precision, recall and F1 of the AdaBoost predictions on the held-out rows.
adb_report = classification_report(y_test, y_pred_adb)
print(adb_report)

In [None]:
# Closing message printed at the end of the notebook.
print("Thank You !!! Keep supporting")