In [None]:
# Library Import

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns 

# magic function is to enable the inline plotting
%matplotlib inline 

# Show every column when a DataFrame is displayed (no "..." truncation).
# pd.set_option is the documented entry point; going through `pd.pandas`
# relies on an attribute that is not part of the public API.
pd.set_option('display.max_columns', None)

In [None]:
# Read Data
df = pd.read_csv("2019.csv")

# A notebook cell only renders its LAST bare expression, so each intermediate
# summary is handed to display() explicitly (display is provided by the
# IPython/Jupyter kernel without an import).
# First 5 rows of data
display(df.head())
# Shape of data
display(df.shape)
# Basic statistics of data (including categorical and numerical columns)
display(df.describe(include='all'))
# Information about the data frame (info() prints its report itself)
df.info()
# Data types
df.dtypes

In [None]:
# Separate the categorical (object / category dtype) columns from all the others
features = df.columns
cat, num = (
    df.select_dtypes(include=['object', 'category']),
    df.select_dtypes(exclude=['object', 'category']),
)

In [None]:
# Unique Values: print the number of distinct values of every column and,
# when there are fewer than 7 of them, the values themselves.
for var in features:
    distinct = df[var].unique()
    if len(distinct) >= 7:
        print(var, '-->', len(distinct))
    else:
        print(var, '-->', len(distinct), ':', distinct)

# To get unique value count
df['features_name'].value_counts()

## Graphical Representation of Data

In [None]:
# analyze categorical Columns:

import seaborn as sns
import matplotlib.pyplot as plt


# One count plot per categorical column, split by the target, two plots per row.
# The grid is sized from the number of categorical columns: a fixed 4x2 grid
# raises IndexError as soon as there are more than 8 of them.
n_cat_cols = len(cat.columns)
n_rows = max(1, (n_cat_cols + 1) // 2)  # ceil(n / 2), at least one row
# squeeze=False keeps `axes` two-dimensional even when there is a single row
fig, axes = plt.subplots(n_rows, 2, figsize=(12, 3.75 * n_rows), squeeze=False)
for idx, cat_col in enumerate(cat.columns):
    row, col = divmod(idx, 2)
    sns.countplot(x=cat_col, data=df, hue='traget_feature', ax=axes[row, col])

# Hide the panels that did not receive a plot
for unused_ax in axes.flat[n_cat_cols:]:
    unused_ax.set_visible(False)

plt.subplots_adjust(hspace=1)

In [None]:
# To analyze Numerical Columns: one box plot per numerical column, grouped by the target.
# The row of axes is sized from the number of numerical columns: a fixed 1x3
# grid raises IndexError when there are more than three of them.
n_num_cols = max(1, len(num.columns))
# squeeze=False keeps `axes` two-dimensional even for a single column
fig, axes = plt.subplots(1, n_num_cols, figsize=(17 / 3 * n_num_cols, 5), squeeze=False)
for idx, num_col in enumerate(num.columns):
    sns.boxplot(y=num_col, data=df, x='traget_feature', ax=axes[0, idx])

print(df[num.columns].describe())
plt.subplots_adjust(hspace=1)

In [None]:
# Correlation Heatmap

plt.figure(figsize=(12,8))
# Correlation is only defined for numeric data, so restrict the frame to its
# numeric columns first; calling corr() while string columns are still present
# raises on recent pandas versions.
sns.heatmap(df.select_dtypes(include='number').corr(), cmap='coolwarm', annot=True, fmt='.1f', linewidths=.1)
plt.show()

In [None]:
# Box Plot
for var in ['features_name']:
    # Pass the series by keyword: how seaborn interprets a positional argument
    # to boxplot differs between releases; x= pins a horizontal box plot.
    sns.boxplot(x=df[var])
    plt.show()

In [None]:
# Plot for seeing the distribution of data
for var in ['feature names']:
    # plt.subplots returns a (figure, axes) pair; unpack it so the plot is
    # drawn on this figure explicitly.
    fig, ax = plt.subplots(figsize=(12, 8))
    # histplot with a KDE overlay replaces the deprecated sns.distplot;
    # stat='density' keeps the y axis on the density scale distplot used.
    sns.histplot(df[var], kde=True, stat='density', ax=ax)
    plt.show()

In [None]:
# Plot the median of the target for each value of the feature
df.groupby('features_name')['traget_feature'].median().plot()
plt.xlabel('features_name')
# The y axis carries the median of the target, not of the grouping feature
plt.ylabel('Median traget_feature')
plt.title("features_name vs traget_feature")

## Missing Data

In [None]:
# Missing Completely at Random(MCAR):
""" A variable is missing completely at random (MCAR)if the missing values on a given variable (Y) don't have a relationship with other 
variables in a given data set or with the variable (Y) itself. In other words, When data is MCAR, there is no relationship between the data
missing and any values, and there is no particular reason for the missing values."""

# Missing at Random(MAR):
""" Let's understands the following examples:
Women are less likely to talk about age and weight than men.
Men are less likely to talk about salary and emotions than women.
Sound familiar? This sort of missing content indicates data that is missing at random.

MAR occurs when the missingness is not random, but there is a systematic relationship between missing values and other observed data but not
the missing data.
Let me explain to you: you are working on a dataset of ABC survey. You will find out that many emotion observations are null.
You decide to dig deeper and found most of the emotion observations are null that belongs to men's observation."""

# Missing Not at Random(MNAR):
""" The final and most difficult situation of missingness. MNAR occurs when the missingness is not random, and there is a systematic relationship
between missing value, observed value, and missing itself. To make sure, If the missingness is in 2 or more variables holding the same pattern,
you can sort the data with one variable and visualize it."""



In [None]:
# Detecting missing data: per-column count and percentage of missing values
mis_val = df.isna().sum()
mis_val_per = mis_val / len(df) * 100
mis_val_table = pd.concat([mis_val, mis_val_per], axis=1)
mis_val_table_ren_columns = mis_val_table.rename(
        columns = {0 : 'Missing Values', 1 : '% of Total Values'})
# Keep only the columns that really have missing values. Indexing with a
# boolean *DataFrame* (table[table != 0]) drops nothing - it only turns the
# zeros into NaN - so the rows are selected with a boolean Series instead.
mis_val_table_ren_columns = mis_val_table_ren_columns[
       mis_val_table_ren_columns['Missing Values'] != 0].sort_values(
        '% of Total Values', ascending=False).round(1)
mis_val_table_ren_columns

In [None]:
# Names of the columns that hold at least one null value
na_features = df.columns[df.isnull().any()].tolist()

## Finding reason for missing data using plot

In [None]:
# For Graphical view of missing data
"""The msno.matrix() is a nullity matrix that will help to visualize the location of the null observations."""

import missingno as msno
msno.matrix(df)

# To sort graph using one value
"""The missingno package additionally lets us sort the chart by a selective column. Let's sort the value by one feature name 
column to detect if there is a pattern in the missing values."""

# Named df_sorted rather than `sorted`: binding that name would shadow
# Python's built-in sorted() for every cell run afterwards in this kernel.
df_sorted = df.sort_values('feature_name')
msno.matrix(df_sorted)

# Heatmap for Missing Value
"""msno. heatmap() helps to visualize the correlation between missing features. 
The missingno correlation heatmap measures nullity correlation: how strongly the presence or absence of one variable affects the 
presence of another

Nullity correlation ranges from -1 (if one variable appears the other definitely does not) to 0 (variables appearing or not 
appearing have no effect on one another) to 1 (if one variable appears the other definitely also does)"""

msno.heatmap(df)

# Dendrogram for missing value

msno.dendrogram(df)

## Treating Missing values

In [None]:
# Complete Case Analysis(CCA):
"""This is a quite straightforward method of handling the Missing Data, which directly removes the rows that have missing data 
i.e we consider only those rows where we have complete data i.e data is not missing. This method is also 
popularly known as “Listwise deletion”.

When to Use:-
> Data is MAR(Missing At Random).
> Good for Mixed, Numerical, and Categorical data.
> Missing data is not more than 5% - 6% of the dataset.
> Data doesn't contain much information and will not bias the dataset.
"""

# Drop rows which contain any NaN or missing value for the feature_name column;
# for the complete df remove `subset` and `how`.
# NOTE: dropna returns a new frame - assign the result to keep it.
df.dropna(subset=['feature_name'], how='any', axis=0)

## Imputations Techniques for non Time Series Problems:

# Arbitrary Value Imputation
"""This is an important technique used in Imputation as it can handle both the Numerical and Categorical variables. 
This technique states that we group the missing values in a column and assign them to a new value that is far away from the range of that column.
Mostly we use values like 99999999 or -9999999 or “Missing” or “Not defined” for numerical & categorical variables.

When to Use:-
> When data is not MAR(Missing At Random).
> Suitable for All.
"""

df['features_name'].fillna('Missing')


# Frequent Category Imputation
""""This technique says to replace the missing value with the variable with the highest frequency or in simple words replacing the values 
with the Mode of that column. This technique is also referred to as Mode

When to Use:-
> Data is Missing at Random(MAR)
> Missing data is not more than 5% - 6% of the dataset.
"""

# mode() returns a Series (there can be ties). Passing that Series to fillna
# aligns it on the index, so only the row labelled 0 would be filled; take the
# first modal value as a scalar instead.
# NOTE: fillna returns a new Series - assign the result to keep it.
df['features_name'].fillna(df['features_name'].mode().iloc[0])

## Imputations Techniques for Time Series Problems:

# Imputing using ffill (carry the last valid observation forward).
# DataFrame.ffill / DataFrame.bfill replace fillna(method=...), which is
# deprecated in recent pandas releases.

df.ffill()

# Imputation using bfill (fill a gap with the next valid observation)

df.bfill()

# Imputation using Linear Interpolation method

"""Linear interpolation is an imputation technique that assumes a linear relationship between data points and utilises non-missing values
 from adjacent data points to compute a value for a missing data point."""

df.interpolate(limit_direction="both")

## Advanced Imputation Techniques:

# Imputation Using k-NN
"""The k nearest neighbours is an algorithm that is used for simple classification. The algorithm uses 'feature similarity' to predict 
the values of any new data points. This means that the new point is assigned a value based on how closely it resembles the points in the
training set. This can be very useful in making predictions about the missing values by finding the k's closest neighbours to the 
observation with missing data and then imputing them based on the non-missing values in the neighbourhood.

The fundamental weakness of KNN doesn't work on categorical features. We need to convert them into numeric using any encoding method. 
It requires normalizing data as KNN Imputer is a distance-based imputation method and different scales of data generate biased replacements 
for the missing values.
"""

from sklearn.impute import KNNImputer
knn_imputer = KNNImputer(n_neighbors=2, weights="uniform")
# fit_transform returns an (n, 1) array; flatten it before storing it as a column
df['feature_name'] = knn_imputer.fit_transform(df[['feature_name']]).ravel()
# for whole dataset: reuse df's index so the imputed frame keeps the original row labels
df = pd.DataFrame(knn_imputer.fit_transform(df), columns=df.columns, index=df.index)

# Imputation Using Multivariate Imputation by Chained Equation(MICE)
"""This type of imputation works by filling the missing data multiple times. Multiple Imputations (MIs) are much better than a single 
imputation as it measures the uncertainty of the missing values in a better way. The chained equations approach is also very flexible and can
 handle different variables of different data types (ie., continuous or binary) as well as complexities such as bounds or survey skip patterns. 
"""

from impyute.imputation.cs import mice
imputed_training=mice(df.values)

# In sklearn, it is implemented as follows

from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
mice_imputer = IterativeImputer()
df['feature_name'] = mice_imputer.fit_transform(df[['feature_name']])

# Stochastic regression imputation:
"""It is quite similar to regression imputation which tries to predict the missing values by regressing it from other related variables
 in the same dataset plus some random residual value."""

# Extrapolation and Interpolation:
"""It tries to estimate values from other observations within the range of a discrete set of known data points."""

# Hot-Deck imputation:
"""Works by randomly choosing the missing value from a set of related and similar variables."""

# Cold-Deck Imputation: 
"""A systematically chosen value from an individual who has similar values on other variables.This is similar to Hot Deck in most ways,
 but removes the random variation."""

 # Replacing Values using backward & forward filling 

# DataFrame.bfill replaces fillna(method='bfill'), which is deprecated in recent pandas
df.bfill(axis=0)

## Outlier Detection and Treatment

In [None]:
# Box plot outlier detection

# The placeholder must be a LIST of column names: looping over a bare string
# walks it character by character, so df[var] would be looked up as df['f'].
for var in ['feature name list']:
    sns.boxplot(x=df[var])
    plt.show()

In [None]:
# Outlier Treatment

# DELETING OBSERVATIONS:
"""We delete outlier values if it is due to data entry error, data processing error or outlier observations are very small in numbers. 
We can also use trimming at both ends to remove outliers. But deleting the observation is not a good idea when we have small dataset."""

# Flag a row as an outlier when ANY of the listed columns falls outside that
# column's own IQR fences, then drop the flagged rows in one go.
# (Filtering once after the loop with the last Q1/Q3 would compare every
# column of df against the fences of the last column only; and the placeholder
# has to be a list, because looping over a bare string yields single characters.)
outlier_rows = pd.Series(False, index=df.index)
for var in ['list of feature name']:
    Q1 = df[var].quantile(0.25)
    Q3 = df[var].quantile(0.75)
    IQR = Q3 - Q1
    outlier_rows |= (df[var] < (Q1 - 1.5 * IQR)) | (df[var] > (Q3 + 1.5 * IQR))
df = df[~outlier_rows]


# TRANSFORMING VALUES:

# Scalling
from sklearn import preprocessing
scaler = preprocessing.StandardScaler()
df['feature_name'] = scaler.fit_transform(df['feature_name'].values.reshape(-1,1))

# Log Transformation
df['feature_name'] = np.log(df['feature_name'])

# Box-Cox transformation
# Import the submodule explicitly: a bare `import scipy` does not guarantee
# that scipy.stats has been loaded.
import scipy.stats
# lmbda=None lets boxcox estimate the lambda itself; it is returned as fitted_lambda
df['feature_name'], fitted_lambda = scipy.stats.boxcox(df['feature_name'], lmbda=None)

# IMPUTATION: replace the values outside the IQR fences with a central value
q1 = df['feature_name'].quantile(0.25)
q3 = df['feature_name'].quantile(0.75)
iqr = q3-q1
Lower_tail = q1 - 1.5 * iqr
Upper_tail = q3 + 1.5 * iqr
m = np.mean(df['feature_name']) # Mean,median,mode
# One vectorised pass instead of one Series.replace call per row
is_outlier = (df['feature_name'] > Upper_tail) | (df['feature_name'] < Lower_tail)
df['feature_name'] = df['feature_name'].mask(is_outlier, m)

# Binning
""" Binning the data and categorizing them will totally avoid the outliers. It will make the data categorical instead."""

df['feature_name'] = pd.cut(df['feature_name'], bins = [0, 10, 20, 30, 40, 55], labels = ['Very Low', 'Low', 'Average', 'High', 'Very High'])

# Quantile based flooring & capping

q10, q90 = df['feature_name'].quantile(0.10), df['feature_name'].quantile(0.90)
# floor: anything below the 10th percentile is raised to it
df["feature_name"] = np.where(df["feature_name"] < q10, q10, df["feature_name"])
# cap: anything above the 90th percentile is lowered to it
df["feature_name"] = np.where(df["feature_name"] > q90, q90, df["feature_name"])


# Winsorizing
"""Unlike trimming, here we replace the outliers with other values. Common is replacing the outliers on the upper side with 95% percentile
 value and outlier on the lower side with 5% percentile."""

import scipy.stats
scipy.stats.mstats.winsorize(df['feature_name'],limits=0.05)

## Feature Transformation

In [None]:
# Label encoding using map function
feature_name_dict = {'N':0, 'Y':1}
df['feature_name'] = df['feature_name'].map(feature_name_dict)

# Label Encoding
from sklearn.preprocessing import LabelEncoder
# Must be a LIST of column names: looping over a bare string visits it
# character by character, so df[col] would be looked up as df['l'].
feature_col = ["list of feature names"]
le = LabelEncoder()
for col in feature_col:
    # fit_transform refits the encoder on each column in turn
    df[col] = le.fit_transform(df[col])

# One Hot encoding
from sklearn.preprocessing import OneHotEncoder
# The keyword requesting dense output was renamed (sparse -> sparse_output)
# across scikit-learn releases; leaving it at its default and densifying the
# result with .toarray() works with either spelling.
onehotencoder = OneHotEncoder(drop = 'if_binary')
transformed_data = onehotencoder.fit_transform(df[['feature_name']]).toarray()
# transformed_data is an array, so convert it to a dataframe named after the
# encoder's output features; reuse df's index so the concat below lines the
# rows up even when df no longer has a 0..n-1 index
encoded_data = pd.DataFrame(transformed_data,
                            columns=onehotencoder.get_feature_names_out(),
                            index=df.index)
# now concatenate the original data and the encoded data using pandas
df = pd.concat([df, encoded_data], axis=1).drop('feature_name', axis=1)

In [None]:
# standard Scaling
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
na_features = ['feature names']
for var in na_features:
    df[var] = scaler.fit_transform(df[var].values.reshape(-1, 1))
    df_test[var] = scaler.transform(df_test[var].values.reshape(-1, 1))

# maximum absolute scaling
from sklearn.preprocessing import MaxAbsScaler
max_scaler = MaxAbsScaler()
for var in na_features:
    df[var] = max_scaler.fit_transform(df[var].values.reshape(-1, 1))
    df_test[var] = max_scaler.transform(df_test[var].values.reshape(-1, 1))  

#  min-max feature scaling
from sklearn.preprocessing import MinMaxScaler
mm_scaler = MinMaxScaler()
for var in na_features:
    df[var] = mm_scaler.fit_transform(df[var].values.reshape(-1, 1))
    df_test[var] = mm_scaler.transform(df_test[var].values.reshape(-1, 1))  

# log scaling  
from sklearn.preprocessing import FunctionTransformer
ln = FunctionTransformer(np.log1p)
for var in na_features:
    df[var] = ln.fit_transform(df[var].values.reshape(-1, 1))
    df_test[var] = ln.transform(df_test[var].values.reshape(-1, 1))

# Normalize
from sklearn.preprocessing import Normalizer
norm = Normalizer()
# Normalizer rescales each ROW to unit norm. Fed one column at a time (shape
# (n, 1)) it would collapse every non-zero value to +/-1, so all the listed
# columns are transformed together - na_features should name at least two.
df[na_features] = norm.fit_transform(df[na_features])
df_test[na_features] = norm.transform(df_test[na_features])

# RobustScaler
from sklearn.preprocessing import RobustScaler
robu = RobustScaler()
for var in na_features:
    df[var] = robu.fit_transform(df[var].values.reshape(-1, 1))
    df_test[var] = robu.transform(df_test[var].values.reshape(-1, 1))


## Generalized Model

In [None]:
# Train test Split

from sklearn.model_selection import train_test_split
X = df.drop(['Traget_feature'], axis=1)
# Select the target as a 1-D Series: a one-column DataFrame makes several
# scikit-learn estimators warn that a column-vector y was passed
y = df['Traget_feature']
X_test = df_test
# Hold out 20% of the rows for validation; random_state makes the split repeatable
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.20, random_state=42, shuffle=True)

## Regression

In [None]:
# LinearRegression
from sklearn import linear_model
from sklearn.metrics import mean_squared_error

lr=linear_model.LinearRegression()
lm_model=lr.fit(X_train,y_train)
y_pred=lm_model.predict(X_val)
rmse = np.sqrt(mean_squared_error(y_val,y_pred))

# XGBRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error

xgb_reg = XGBRegressor(n_estimators=400, max_depth = 20, learning_rate = 0.05,  min_child_weight  = 40)
xgb_reg.fit(X_train, y_train)
y_pred = xgb_reg.predict(X_val)
rmse = np.sqrt(mean_squared_error(y_pred, y_val))
print (xgb_reg)

# RandomForestRegressor

from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

random_forest_regressor = RandomForestRegressor(n_estimators = 10, random_state = 27)
random_forest_regressor.fit(X_train, y_train)
Y_pred = random_forest_regressor.predict(X_val)
mse = mean_squared_error(y_val, Y_pred)
rmse = np.sqrt(mse)

# DecisionTreeRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error
tree_model = DecisionTreeRegressor(random_state=1)
tree_model.fit(X_train, y_train)
Y_pred = tree_model.predict(X_val)
mse = mean_squared_error(y_val, Y_pred)
rmse = np.sqrt(mse)

## Classification

In [None]:
# LogisticRegression
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

logistic_model = LogisticRegression(random_state=1)
lg_model = logistic_model.fit(X_train,y_train)
y_pred_logistic=lg_model.predict(X_val)
score_logistic =accuracy_score(y_pred_logistic,y_val)*100

# DecisionTreeClassifier
from sklearn.tree import DecisionTreeClassifier
tree_model = DecisionTreeClassifier(random_state=1)
tree_model.fit(X_train,y_train)
pred_cv_tree=tree_model.predict(X_val)
score_tree =accuracy_score(pred_cv_tree,y_val)*100 

# RandomForestClassifier with grid_search
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
paramgrid = {'max_depth': list(range(1,20,2)),'n_estimators':list(range(1,200,20))}
grid_search = GridSearchCV(RandomForestClassifier(random_state=1),paramgrid)
grid_search.fit(X_train,y_train)
# A bare expression in the middle of a cell is not rendered, so print the winner
print(grid_search.best_params_)

# Build the final model from the parameters the search actually selected,
# instead of hard-coded values that have to be copied over by hand
forest_model = RandomForestClassifier(random_state=1, **grid_search.best_params_)
forest_model.fit(X_train,y_train)
pred_cv_forest=forest_model.predict(X_val)
score_forest = accuracy_score(pred_cv_forest,y_val)*100

# XGBClassifier
from xgboost import XGBClassifier
xgb_model = XGBClassifier(n_estimators=50,max_depth=4) 
xgb_model.fit(X_train,y_train)
pred_xgb=xgb_model.predict(X_val)
score_xgb = accuracy_score(pred_xgb,y_val)*100

## Feature Importance

In [None]:
# Function to plot feature importance
def plot_features(booster, figsize):
    """Plot the feature importance of a fitted XGBoost model.

    Args:
        booster: Fitted model, passed straight through to
            ``xgboost.plot_importance`` as its ``booster`` argument.
        figsize: ``(width, height)`` of the figure, forwarded to
            ``plt.subplots``.

    Returns:
        Whatever ``xgboost.plot_importance`` returns for the drawn axes.
    """
    # Imported here because nothing else in the notebook brings
    # plot_importance into scope (only XGBRegressor / XGBClassifier are imported).
    from xgboost import plot_importance

    fig, ax = plt.subplots(1, 1, figsize=figsize)
    return plot_importance(booster=booster, ax=ax)

import matplotlib.pyplot as plt
%matplotlib inline
plot_features(model_name, (10,14))

## .....

In [None]:
# Library for saving all model performance in one place
from collections import OrderedDict
model_performance = OrderedDict()

# This line for all model performance
model_performance['Model name'] = round(rmse,3)
print(f'Root Mean Squared Error of the model is : {round(rmse,3)}')

# For showing model performance
model_performance

In [None]:
# To save data into another frame.
# Take an explicit copy: writing into a slice of df triggers pandas'
# SettingWithCopyWarning. The copy already holds the column, so it does not
# need to be assigned a second time.
testRes = df[['feature_name']].copy()
testRes

# To predict and save values in columns
yPreds = model_name.predict(X_test)
testRes['target_name'] = yPreds
submission = testRes[['feature_name & Traget_name to be saved in file']]

# To SAve submission file
submission.columns = ['feature_name & Traget_name to be saved in file']
submission.to_csv('submission.csv', index = False)
submission.head()

# Time Series Problems

## Data Preparation

In [None]:
# Date time dtype conversion
df['Datetime']=pd.to_datetime(df.Datetime, format='%d-%m-%Y %H:%M')

In [None]:
# Creating Time Based Feature.This helps regression models to understand the trend in the data.

# Below function extracts date related features from datetime
def create_date_featues(df, date_col='DateTime'):
    """Add calendar features derived from a datetime column.

    The frame is modified in place (the new columns are added to ``df``) and
    is also returned.

    Args:
        df: DataFrame that holds the datetime column.
        date_col: Name of the datetime column; it is parsed with
            ``pd.to_datetime``. Defaults to ``'DateTime'``.
            NOTE(review): other cells of this notebook spell the column
            ``'Datetime'`` - pass ``date_col='Datetime'`` if that is the
            real name.

    Returns:
        The same DataFrame with the columns Year, Month, Day, Dayofweek,
        DayOfyear, Week, Quarter, Is_month_start, Is_month_end,
        Is_quarter_start, Is_quarter_end, Is_year_start, Is_year_end,
        Semester, Is_weekend, Is_weekday and Hour added.
    """
    # Parse the column once and reuse the .dt accessor for every feature
    when = pd.to_datetime(df[date_col]).dt

    df['Year'] = when.year
    df['Month'] = when.month
    df['Day'] = when.day
    df['Dayofweek'] = when.dayofweek
    df['DayOfyear'] = when.dayofyear
    # Series.dt.week was removed in pandas 2.0; the ISO calendar week replaces
    # it. isocalendar() yields a nullable UInt32 column, so convert it back to
    # a plain numpy dtype (float only when missing dates force NaN).
    iso_week = when.isocalendar().week
    df['Week'] = iso_week.astype('float64' if iso_week.isna().any() else 'int64')
    df['Quarter'] = when.quarter
    df['Is_month_start'] = when.is_month_start
    df['Is_month_end'] = when.is_month_end
    df['Is_quarter_start'] = when.is_quarter_start
    df['Is_quarter_end'] = when.is_quarter_end
    df['Is_year_start'] = when.is_year_start
    df['Is_year_end'] = when.is_year_end
    # Quarters 1-2 form semester 1, quarters 3-4 semester 2
    df['Semester'] = np.where(df['Quarter'].isin([1,2]),1,2)
    # dayofweek counts from Monday=0, so 5 and 6 are Saturday and Sunday
    df['Is_weekend'] = np.where(df['Dayofweek'].isin([5,6]),1,0)
    df['Is_weekday'] = np.where(df['Dayofweek'].isin([0,1,2,3,4]),1,0)
    df['Hour'] = when.hour

    return df

# extracting time related 
df=create_date_featues(df)

In [None]:
# To convert datetime col into index
indexed_df = df.set_index('Datetime')

## Examine the data

In [None]:
# show plots in the notebook
%matplotlib inline
indexed_df['traget_feature'].plot(figsize=(12,8));

In [None]:
df.loc[:,['feature_name','traget_feature']].plot(x='feature_name',y='traget_feature',title='traget_feature Trend',figsize=(16,6))

## Train and test split
Time series problems require time-based validation instead of the k-fold validation generally used in regression problems. K-fold splits the data randomly, and checking the model accuracy by predicting a 2016 time period using 2017 data makes no sense.
Instead, we use time-based validation on the time period (2017-01-01 to 2017-04-01) of 4 months, since the test set contains 4 months of data to predict.


In [None]:
# Time-based split: rows up to and including the cut-off form the training
# period, everything strictly after it is the validation period. Slicing both
# sides by label would put the cut-off row(s) in BOTH sets, because label
# slices include their end points.
split_point = pd.Timestamp('2014-02-25 23:00:00')
train1 = indexed_df[indexed_df.index <= split_point]
val1 = indexed_df[indexed_df.index > split_point]

20 aug:
7pm:logistic
8pm:pca
8.45:time_series
24 aug:
8pm:SQL-ETL
25 aug:
8pm:SQL-ETL
