### Libraries

In [1]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import LabelEncoder,StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

In [2]:
pd.set_option("display.max_columns",None)
pd.set_option("display.max_rows",None)

### Loading DataSet

In [3]:
df = pd.read_csv("Delhi Temerature.csv")
df_copy = df.copy()

In [4]:
print("Shape of original df ", df.shape)
print("Shape of copy df ", df_copy.shape)

Shape of original df  (100990, 20)
Shape of copy df  (100990, 20)


In [5]:
df.head()

Unnamed: 0,datetime_utc,_conds,_dewptm,_fog,_hail,_heatindexm,_hum,_precipm,_pressurem,_rain,_snow,_tempm,_thunder,_tornado,_vism,_wdird,_wdire,_wgustm,_windchillm,_wspdm
0,19961101-11:00,Smoke,9.0,0,0,,27.0,,1010.0,0,0,30.0,0,0,5.0,280.0,West,,,7.4
1,19961101-12:00,Smoke,10.0,0,0,,32.0,,-9999.0,0,0,28.0,0,0,,0.0,North,,,
2,19961101-13:00,Smoke,11.0,0,0,,44.0,,-9999.0,0,0,24.0,0,0,,0.0,North,,,
3,19961101-14:00,Smoke,10.0,0,0,,41.0,,1010.0,0,0,24.0,0,0,2.0,0.0,North,,,
4,19961101-16:00,Smoke,11.0,0,0,,47.0,,1011.0,0,0,23.0,0,0,1.2,0.0,North,,,0.0


### Droping high null values

In [6]:
null_set = df.isnull().sum()/df.shape[0]*100
null_col = null_set[null_set>20].keys()
print("Null Columns",null_col)
df_copy = df_copy.drop(columns=null_col)
df_copy = df_copy.drop("datetime_utc",axis=1)
print("Shape after removing null values",df_copy.shape)

Null Columns Index([' _heatindexm', ' _precipm', ' _wgustm', ' _windchillm'], dtype='object')
Shape after removing null values (100990, 15)


In [7]:
df_copy.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100990 entries, 0 to 100989
Data columns (total 15 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0    _conds      100918 non-null  object 
 1    _dewptm     100369 non-null  float64
 2    _fog        100990 non-null  int64  
 3    _hail       100990 non-null  int64  
 4    _hum        100233 non-null  float64
 5    _pressurem  100758 non-null  float64
 6    _rain       100990 non-null  int64  
 7    _snow       100990 non-null  int64  
 8    _tempm      100317 non-null  float64
 9    _thunder    100990 non-null  int64  
 10   _tornado    100990 non-null  int64  
 11   _vism       96562 non-null   float64
 12   _wdird      86235 non-null   float64
 13   _wdire      86235 non-null   object 
 14   _wspdm      98632 non-null   float64
dtypes: float64(7), int64(6), object(2)
memory usage: 11.6+ MB


### Fetching Dtypes value (int,float,object)

In [8]:
num_val = df_copy.select_dtypes(include=["int64","float"]).keys()
print("Columns having interger value-->",num_val,sep="\n")

Columns having interger value-->
Index([' _dewptm', ' _fog', ' _hail', ' _hum', ' _pressurem', ' _rain',
       ' _snow', ' _tempm', ' _thunder', ' _tornado', ' _vism', ' _wdird',
       ' _wspdm'],
      dtype='object')


In [9]:
char_val = df_copy.select_dtypes(include=["object"]).keys()
print("Columns having Character value-->",char_val,sep="\n")

Columns having Character value-->
Index([' _conds', ' _wdire'], dtype='object')


### Filling Null values

In [10]:
num_val = [' _dewptm', ' _fog', ' _hail', ' _hum', ' _pressurem', ' _rain',
       ' _snow',  ' _thunder', ' _tornado', ' _vism', ' _wdird',
       ' _wspdm']
char_val = [' _conds', ' _wdire']

In [11]:
mean_pipeline = Pipeline(steps=[("impute",SimpleImputer(strategy="mean"))])
median_pipeline = Pipeline(steps=[("impute",SimpleImputer(strategy="median"))])
mode_pipeline = Pipeline(steps=[("impute",SimpleImputer(strategy="most_frequent"))])

In [12]:
transformer = ColumnTransformer(transformers=[("median",mean_pipeline,num_val),
                                             ("mode",mode_pipeline,char_val)
                                             ])

In [13]:
transformer.fit(df_copy)

ColumnTransformer(n_jobs=None, remainder='drop', sparse_threshold=0.3,
                  transformer_weights=None,
                  transformers=[('median',
                                 Pipeline(memory=None,
                                          steps=[('impute',
                                                  SimpleImputer(add_indicator=False,
                                                                copy=True,
                                                                fill_value=None,
                                                                missing_values=nan,
                                                                strategy='mean',
                                                                verbose=0))],
                                          verbose=False),
                                 [' _dewptm', ' _fog', ' _hail', ' _hum',
                                  ' _pressurem', ' _rain', ' _snow',
                                  ' _thund

In [14]:
updated_values = transformer.transform(df_copy)
update_df = pd.DataFrame(updated_values,columns=num_val+char_val)
df_copy.update(update_df)
df_copy.isnull().sum()

 _conds          0
 _dewptm         0
 _fog            0
 _hail           0
 _hum            0
 _pressurem      0
 _rain           0
 _snow           0
 _tempm        673
 _thunder        0
 _tornado        0
 _vism           0
 _wdird          0
 _wdire          0
 _wspdm          0
dtype: int64

In [15]:
df_copy = df_copy.dropna(subset=[" _tempm"])
df_copy.isnull().sum().sum()

0

### Data and target

In [16]:
X = df_copy.drop(columns=[" _tempm"])
y = df_copy[" _tempm"]

### Label Encoding

In [17]:
lb = LabelEncoder()
end = np.array([lb.fit_transform(X[var]) for var in char_val])
X[' _conds'] = end[0]
X[' _wdire'] = end[1]

### Standard Scaller

In [18]:
sc = StandardScaler()
sc.fit(X)
X = sc.transform(X)

### Data Splitting

In [47]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=.3,random_state=40)

### Liner Model

In [48]:
lr = LinearRegression()
lr.fit(X_train,y_train)
print("Model Accuracy -->%.2f"%(lr.score(X_test,y_test)*100))

Model Accuracy -->91.55


### Checking Error

In [49]:
y_pred = lr.predict(X_test)
mse = mean_squared_error(y_test,y_pred)
mse

6.123049816016044

In [50]:
rmse = np.sqrt(mse)
rmse

2.474479706123298