In [45]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns


In [46]:
dataset = pd.read_csv('weatherAUS.csv')

In [47]:
dataset.shape

(145460, 23)

In [48]:
# lets check column type of ds
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 145460 entries, 0 to 145459
Data columns (total 23 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   Date           145460 non-null  object 
 1   Location       145460 non-null  object 
 2   MinTemp        143975 non-null  float64
 3   MaxTemp        144199 non-null  float64
 4   Rainfall       142199 non-null  float64
 5   Evaporation    82670 non-null   float64
 6   Sunshine       75625 non-null   float64
 7   WindGustDir    135134 non-null  object 
 8   WindGustSpeed  135197 non-null  float64
 9   WindDir9am     134894 non-null  object 
 10  WindDir3pm     141232 non-null  object 
 11  WindSpeed9am   143693 non-null  float64
 12  WindSpeed3pm   142398 non-null  float64
 13  Humidity9am    142806 non-null  float64
 14  Humidity3pm    140953 non-null  float64
 15  Pressure9am    130395 non-null  float64
 16  Pressure3pm    130432 non-null  float64
 17  Cloud9am       89572 non-null

In [49]:
dataset.isnull().sum()

Date                 0
Location             0
MinTemp           1485
MaxTemp           1261
Rainfall          3261
Evaporation      62790
Sunshine         69835
WindGustDir      10326
WindGustSpeed    10263
WindDir9am       10566
WindDir3pm        4228
WindSpeed9am      1767
WindSpeed3pm      3062
Humidity9am       2654
Humidity3pm       4507
Pressure9am      15065
Pressure3pm      15028
Cloud9am         55888
Cloud3pm         59358
Temp9am           1767
Temp3pm           3609
RainToday         3261
RainTomorrow      3267
dtype: int64

In [50]:
dataset.drop(columns=['Date'], inplace=True)


In [51]:
## target me null nhi chahiye to wo row remove
dataset.dropna(subset = ['RainTomorrow'] , inplace = True)

In [52]:
dataset.isnull().sum()


Location             0
MinTemp            637
MaxTemp            322
Rainfall          1406
Evaporation      60843
Sunshine         67816
WindGustDir       9330
WindGustSpeed     9270
WindDir9am       10013
WindDir3pm        3778
WindSpeed9am      1348
WindSpeed3pm      2630
Humidity9am       1774
Humidity3pm       3610
Pressure9am      14014
Pressure3pm      13981
Cloud9am         53657
Cloud3pm         57094
Temp9am            904
Temp3pm           2726
RainToday         1406
RainTomorrow         0
dtype: int64

In [53]:
dataset.head()

Unnamed: 0,Location,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,WindDir3pm,...,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday,RainTomorrow
0,Albury,13.4,22.9,0.6,,,W,44.0,W,WNW,...,71.0,22.0,1007.7,1007.1,8.0,,16.9,21.8,No,No
1,Albury,7.4,25.1,0.0,,,WNW,44.0,NNW,WSW,...,44.0,25.0,1010.6,1007.8,,,17.2,24.3,No,No
2,Albury,12.9,25.7,0.0,,,WSW,46.0,W,WSW,...,38.0,30.0,1007.6,1008.7,,2.0,21.0,23.2,No,No
3,Albury,9.2,28.0,0.0,,,NE,24.0,SE,E,...,45.0,16.0,1017.6,1012.8,,,18.1,26.5,No,No
4,Albury,17.5,32.3,1.0,,,W,41.0,ENE,NW,...,82.0,33.0,1010.8,1006.0,7.0,8.0,17.8,29.7,No,No


In [54]:
#1 :- Handdling missing value
## make least of feature which has atleast one missing value
feature_with_na = [ feature for feature in dataset.columns if dataset[feature].isnull().sum()>=1]

In [55]:
feature_with_na

['MinTemp',
 'MaxTemp',
 'Rainfall',
 'Evaporation',
 'Sunshine',
 'WindGustDir',
 'WindGustSpeed',
 'WindDir9am',
 'WindDir3pm',
 'WindSpeed9am',
 'WindSpeed3pm',
 'Humidity9am',
 'Humidity3pm',
 'Pressure9am',
 'Pressure3pm',
 'Cloud9am',
 'Cloud3pm',
 'Temp9am',
 'Temp3pm',
 'RainToday']

In [56]:
# now lets check ki kis feature m kitne % nan hai
for feature in feature_with_na:
    print(feature , np.round(dataset[feature].isnull().mean() ,4)*100  ,'missing values')

MinTemp 0.44999999999999996 missing values
MaxTemp 0.22999999999999998 missing values
Rainfall 0.9900000000000001 missing values
Evaporation 42.79 missing values
Sunshine 47.69 missing values
WindGustDir 6.5600000000000005 missing values
WindGustSpeed 6.52 missing values
WindDir9am 7.04 missing values
WindDir3pm 2.6599999999999997 missing values
WindSpeed9am 0.95 missing values
WindSpeed3pm 1.8499999999999999 missing values
Humidity9am 1.25 missing values
Humidity3pm 2.54 missing values
Pressure9am 9.86 missing values
Pressure3pm 9.83 missing values
Cloud9am 37.74 missing values
Cloud3pm 40.150000000000006 missing values
Temp9am 0.64 missing values
Temp3pm 1.92 missing values
RainToday 0.9900000000000001 missing values


In [57]:
numerical_features = dataset.select_dtypes(include=[np.number]).columns

In [58]:
numerical_features

Index(['MinTemp', 'MaxTemp', 'Rainfall', 'Evaporation', 'Sunshine',
       'WindGustSpeed', 'WindSpeed9am', 'WindSpeed3pm', 'Humidity9am',
       'Humidity3pm', 'Pressure9am', 'Pressure3pm', 'Cloud9am', 'Cloud3pm',
       'Temp9am', 'Temp3pm'],
      dtype='object')

In [59]:
# no relation ig so remove na

In [60]:
# 1> replacing categorical column nan value
feature_nan_categorical = [ feature for feature in dataset.columns if dataset[feature] .isnull().sum()>1 and dataset[feature].dtypes == 'object']

In [61]:
feature_nan_categorical

['WindGustDir', 'WindDir9am', 'WindDir3pm', 'RainToday']

In [62]:
categorical_cols = [col for col in dataset.columns if dataset[col].dtype == 'object']

In [63]:
dataset[categorical_cols].isnull().sum()

Location            0
WindGustDir      9330
WindDir9am      10013
WindDir3pm       3778
RainToday        1406
RainTomorrow        0
dtype: int64

In [64]:
# now replace nan with label like missing function bna ke
def replace_cat_nan(dataset , feature_nan_categorical ):
    data = dataset.copy()
    data[feature_nan_categorical] = data[feature_nan_categorical].fillna('Missing')
    return data

In [65]:
dataset = replace_cat_nan(dataset ,feature_nan_categorical )

In [66]:
dataset['WindGustDir'].isnull().sum()

0

In [67]:
#  now remove nan in numerical feature 

In [68]:
feature_nan_num = [feature for feature in dataset.columns if dataset[feature].isnull().sum()>1 and dataset[feature].dtypes !='object']

In [69]:
feature_nan_num

['MinTemp',
 'MaxTemp',
 'Rainfall',
 'Evaporation',
 'Sunshine',
 'WindGustSpeed',
 'WindSpeed9am',
 'WindSpeed3pm',
 'Humidity9am',
 'Humidity3pm',
 'Pressure9am',
 'Pressure3pm',
 'Cloud9am',
 'Cloud3pm',
 'Temp9am',
 'Temp3pm']

In [70]:
# now jahan jahan null h use median se bhar do

In [71]:
median_values = dataset[feature_nan_num].median()

In [72]:
median_values

MinTemp            12.0
MaxTemp            22.6
Rainfall            0.0
Evaporation         4.8
Sunshine            8.5
WindGustSpeed      39.0
WindSpeed9am       13.0
WindSpeed3pm       19.0
Humidity9am        70.0
Humidity3pm        52.0
Pressure9am      1017.6
Pressure3pm      1015.2
Cloud9am            5.0
Cloud3pm            5.0
Temp9am            16.7
Temp3pm            21.1
dtype: float64

In [73]:
for feature in feature_nan_num:
    median_value = dataset[feature].median()
    dataset[feature].fillna(median_value ,inplace=True )

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  dataset[feature].fillna(median_value ,inplace=True )
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  dataset[feature].fillna(median_value ,inplace=True )
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are se

In [74]:
dataset['MinTemp'].isnull().sum()

0

In [75]:
dataset.head()

Unnamed: 0,Location,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,WindDir3pm,...,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday,RainTomorrow
0,Albury,13.4,22.9,0.6,4.8,8.5,W,44.0,W,WNW,...,71.0,22.0,1007.7,1007.1,8.0,5.0,16.9,21.8,No,No
1,Albury,7.4,25.1,0.0,4.8,8.5,WNW,44.0,NNW,WSW,...,44.0,25.0,1010.6,1007.8,5.0,5.0,17.2,24.3,No,No
2,Albury,12.9,25.7,0.0,4.8,8.5,WSW,46.0,W,WSW,...,38.0,30.0,1007.6,1008.7,5.0,2.0,21.0,23.2,No,No
3,Albury,9.2,28.0,0.0,4.8,8.5,NE,24.0,SE,E,...,45.0,16.0,1017.6,1012.8,5.0,5.0,18.1,26.5,No,No
4,Albury,17.5,32.3,1.0,4.8,8.5,W,41.0,ENE,NW,...,82.0,33.0,1010.8,1006.0,7.0,8.0,17.8,29.7,No,No


In [76]:
dataset.dropna(subset = ['RainTomorrow'] , inplace = True)

In [77]:
dataset.head()

Unnamed: 0,Location,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,WindDir3pm,...,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday,RainTomorrow
0,Albury,13.4,22.9,0.6,4.8,8.5,W,44.0,W,WNW,...,71.0,22.0,1007.7,1007.1,8.0,5.0,16.9,21.8,No,No
1,Albury,7.4,25.1,0.0,4.8,8.5,WNW,44.0,NNW,WSW,...,44.0,25.0,1010.6,1007.8,5.0,5.0,17.2,24.3,No,No
2,Albury,12.9,25.7,0.0,4.8,8.5,WSW,46.0,W,WSW,...,38.0,30.0,1007.6,1008.7,5.0,2.0,21.0,23.2,No,No
3,Albury,9.2,28.0,0.0,4.8,8.5,NE,24.0,SE,E,...,45.0,16.0,1017.6,1012.8,5.0,5.0,18.1,26.5,No,No
4,Albury,17.5,32.3,1.0,4.8,8.5,W,41.0,ENE,NW,...,82.0,33.0,1010.8,1006.0,7.0,8.0,17.8,29.7,No,No


In [78]:
def convert_to_num(dataset):
    return [1 if data == 'Yes' else 0 for data in dataset]

In [79]:
dataset['RainTomorrow'] = convert_to_num(dataset['RainTomorrow'])
dataset
    

Unnamed: 0,Location,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,WindDir3pm,...,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday,RainTomorrow
0,Albury,13.4,22.9,0.6,4.8,8.5,W,44.0,W,WNW,...,71.0,22.0,1007.7,1007.1,8.0,5.0,16.9,21.8,No,0
1,Albury,7.4,25.1,0.0,4.8,8.5,WNW,44.0,NNW,WSW,...,44.0,25.0,1010.6,1007.8,5.0,5.0,17.2,24.3,No,0
2,Albury,12.9,25.7,0.0,4.8,8.5,WSW,46.0,W,WSW,...,38.0,30.0,1007.6,1008.7,5.0,2.0,21.0,23.2,No,0
3,Albury,9.2,28.0,0.0,4.8,8.5,NE,24.0,SE,E,...,45.0,16.0,1017.6,1012.8,5.0,5.0,18.1,26.5,No,0
4,Albury,17.5,32.3,1.0,4.8,8.5,W,41.0,ENE,NW,...,82.0,33.0,1010.8,1006.0,7.0,8.0,17.8,29.7,No,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
145454,Uluru,3.5,21.8,0.0,4.8,8.5,E,31.0,ESE,E,...,59.0,27.0,1024.7,1021.2,5.0,5.0,9.4,20.9,No,0
145455,Uluru,2.8,23.4,0.0,4.8,8.5,E,31.0,SE,ENE,...,51.0,24.0,1024.6,1020.3,5.0,5.0,10.1,22.4,No,0
145456,Uluru,3.6,25.3,0.0,4.8,8.5,NNW,22.0,SE,N,...,56.0,21.0,1023.5,1019.1,5.0,5.0,10.9,24.5,No,0
145457,Uluru,5.4,26.9,0.0,4.8,8.5,N,37.0,SE,WNW,...,53.0,24.0,1021.0,1016.8,5.0,5.0,12.5,26.1,No,0


In [80]:
from sklearn.model_selection import train_test_split

In [81]:
X_train,X_test,y_train,y_test = train_test_split(dataset.drop(columns = ['RainTomorrow']),dataset['RainTomorrow'],test_size=0.2)

In [82]:
#######one hot encoding krte
#windgustdir , location , winddir9am,winddir3pm , rain today

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
transformer = ColumnTransformer(transformers=[
    ('trf1' , OneHotEncoder(sparse_output = False , drop = 'first') , ['Location' , 'WindGustDir' ,'WindDir9am' ,'WindDir3pm' ,'RainToday']
    )
],remainder = 'passthrough')


In [83]:
X_train = transformer.fit_transform(X_train)
X_test = transformer.transform(X_test)

In [84]:
from sklearn.model_selection import GridSearchCV


In [85]:
dataset.head()

Unnamed: 0,Location,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,WindDir3pm,...,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday,RainTomorrow
0,Albury,13.4,22.9,0.6,4.8,8.5,W,44.0,W,WNW,...,71.0,22.0,1007.7,1007.1,8.0,5.0,16.9,21.8,No,0
1,Albury,7.4,25.1,0.0,4.8,8.5,WNW,44.0,NNW,WSW,...,44.0,25.0,1010.6,1007.8,5.0,5.0,17.2,24.3,No,0
2,Albury,12.9,25.7,0.0,4.8,8.5,WSW,46.0,W,WSW,...,38.0,30.0,1007.6,1008.7,5.0,2.0,21.0,23.2,No,0
3,Albury,9.2,28.0,0.0,4.8,8.5,NE,24.0,SE,E,...,45.0,16.0,1017.6,1012.8,5.0,5.0,18.1,26.5,No,0
4,Albury,17.5,32.3,1.0,4.8,8.5,W,41.0,ENE,NW,...,82.0,33.0,1010.8,1006.0,7.0,8.0,17.8,29.7,No,0


In [86]:
X_train.shape

(113754, 114)

In [87]:
from sklearn.tree import DecisionTreeClassifier

In [88]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score



# Train the Decision Tree Classifier
clf = DecisionTreeClassifier()
from sklearn.model_selection import GridSearchCV
parameters = {'criterion':["gini", "entropy", "log_loss"],
    'splitter':["best", "random"],
    'max_depth':[1,2,3,4,5,6,7,8,9,10,None] }

cv = GridSearchCV(clf,parameters,scoring='accuracy',cv=5)

cv.fit(X_train,y_train)
predict = cv.predict(X_test)
score=accuracy_score(y_test , y_pred)
score




NameError: name 'y_pred' is not defined

In [None]:
X_train

In [None]:
y_train

In [None]:
y_train

In [None]:
dataset.head()

In [None]:
from sklearn.tree import plot_tree

In [None]:

from matplotlib.pylab import rcParams

rcParams['figure.figsize'] = 80,50
plt.figure(figsize=(20, 10))
plot_tree(cv, filled=True)
plt.show()

plot_tree(cv)