In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
import warnings

# Assigning `warnings.ignore = True` only attached an unused attribute to the
# module and silenced nothing; installing an "ignore" filter is what actually
# suppresses warnings for the rest of the session.
warnings.filterwarnings("ignore")

In [None]:
from sklearn.datasets import load_boston

In [None]:
l = load_boston()

In [None]:
print(l.DESCR)

In [None]:
df = pd.DataFrame(l.data, columns=l.feature_names)

In [None]:
df['MEDV'] = l.target

In [None]:
df.head(20)

In [None]:
df.shape

In [None]:
df.info()

In [None]:
round(df.describe(),2).T

In [None]:
plt.plot(df['CRIM'])

In [None]:
# Pass the column as `x` explicitly: recent seaborn releases treat the first
# positional argument as `data`, so the positional form no longer counts the
# CHAS categories as intended.
sns.countplot(x=df['CHAS'])

In [None]:
df['CHAS'].value_counts()

In [None]:
plt.scatter(df['RM'],df['MEDV'])
plt.xlabel('Rooms')
plt.ylabel('PRICE')

In [None]:
plt.scatter(df['RAD'],df['MEDV'])
plt.xlabel('HIGHWAY')
plt.ylabel('PRICE')

In [None]:
# Check the distribution of the target variable

In [None]:
sns.boxplot(y=df['MEDV'])

In [None]:
# `sns.distplot` is deprecated in seaborn; `histplot` with a KDE overlay on a
# density scale draws the equivalent histogram-plus-curve view of MEDV.
sns.histplot(x=df['MEDV'], kde=True, stat='density')

In [None]:
# Assumption 1 --> Make sure all the x variables have a linear relationship with y variable

In [None]:
round(df.corr(),2)

In [None]:
plt.scatter(df['LSTAT'],df['MEDV'])

In [None]:
plt.figure(figsize=(10,5))
sns.heatmap(df.corr(), annot=True)

In [None]:
x = df.iloc[:,:13]

In [None]:
y = df['MEDV']

In [None]:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.20, random_state=2)

In [None]:
x_train.shape

In [None]:
y_test.shape

In [None]:
# Linear regression estimator from scikit-learn.
from sklearn.linear_model import LinearRegression

In [None]:
# Create an unfitted estimator with default settings.
lr = LinearRegression()

In [None]:
# Fit on the training split and keep the return value of `fit` as `model`.
# NOTE(review): scikit-learn's `fit` presumably returns the estimator itself, so
# `model` and `lr` would be the same object — confirm before refitting `lr`.
model = lr.fit(x_train,y_train)

In [None]:
# Display the fitted estimator.
model

In [None]:
# Coefficient at position 8 of the fitted coefficient array.
model.coef_[8]

In [None]:
# Column labels of the training features, for matching positions to names.
x_train.columns

In [None]:
# Full array of fitted coefficients.
model.coef_

In [None]:
# Coefficient at position 7 of the fitted coefficient array.
model.coef_[7]

In [None]:
# First rows of the full frame, shown again for reference.
df.head()

In [None]:
# List every attribute and method name available on the fitted model.
dir(model)

In [None]:
# The estimator's `fit_intercept` setting.
model.fit_intercept

In [None]:
# Fitted intercept term.
model.intercept_

In [None]:
# Reproduce one prediction by hand: intercept + sum(coef_i * feature_i).
# The values are listed in the same positional order as `model.coef_`.
sample_features = np.array([
    5.82115, 0.00000, 18.10000, 0.00000, 0.71300, 6.51300, 89.90000,
    2.80160, 24.00000, 666.00000, 20.20000, 393.82000, 10.29000,
])
# Guard against a silent mismatch between the hand-typed row and the model.
assert sample_features.shape == model.coef_.shape, "one value per coefficient is required"
p = model.intercept_ + np.dot(model.coef_, sample_features)

In [None]:
pred_values = model.predict(x_test)

In [None]:
x_test.iloc[0]

In [None]:
y_test.iloc[0]

In [None]:
pred_values

In [None]:
from statsmodels import api as sm 

In [None]:
model_stats = sm.OLS(y_train,x_train)

In [None]:
model_stats= model_stats.fit()

In [None]:
model_stats.summary()

In [None]:
df.columns

In [None]:
# Forward selection --> Add one column at a time and check its p-value
# Backward elimination --> Fit the model with all the variables and remove the ones with the largest p-values
# Stepwise selection --> Add or remove variables based on the Akaike information criterion (AIC)

In [None]:
model_updated = sm.OLS(y_train,x_train.drop(['INDUS','NOX','AGE'], axis=1))

In [None]:
model_updated=model_updated.fit()

In [None]:
model_updated.summary()

In [None]:
model_updated.fittedvalues[:5]

In [None]:
y_train[:5]

In [None]:
model_updated.resid[:5]

In [None]:
# Import here so this cell works on a fresh kernel: the notebook's own
# sklearn.metrics import cell sits further down, so running top to bottom
# reached this line with `mean_squared_error` still undefined.
from sklearn.metrics import mean_squared_error

# RMSE of the reduced OLS model on the training split.
train_features = x_train.drop(['INDUS','NOX','AGE'], axis=1)
np.sqrt(mean_squared_error(y_train, model_updated.predict(train_features)))

In [None]:
# Assumption 2 --> Avoid high multicollinearity
# VIF --> Variance Inflation Factor; a value above 10 generally indicates that the variable
# is highly correlated with the other predictors

In [None]:
plt.figure(figsize=(15,5))
sns.heatmap(x.corr(),annot=True)

In [None]:
from statsmodels.stats.outliers_influence import variance_inflation_factor

In [None]:
[variance_inflation_factor(x.values,i) for i in range (len(x.columns))]

In [None]:
x.columns

In [None]:
# Assumption 3 --> Make sure the errors (residuals) of the model show no pattern,
# i.e. they follow a normal distribution

In [None]:
# `sns.distplot` is deprecated in seaborn; `histplot` with a KDE overlay on a
# density scale shows the same histogram-plus-curve view of the residuals.
sns.histplot(x=model_updated.resid, kde=True, stat='density')

In [None]:
sns.boxplot(y=model_updated.resid)

In [None]:
pred_price = model_updated.predict(x_test.drop(['INDUS','NOX','AGE'], axis=1))

In [None]:
pred_price[:5]

In [None]:
y_test[:5]

In [None]:
from sklearn.metrics import mean_squared_error, mean_absolute_error

In [None]:
rmse=np.sqrt(mean_squared_error(y_test,pred_price))

In [None]:
rmse

In [None]:
mean_absolute_error(y_test,pred_price)

In [None]:
error_df = pd.DataFrame({'Actual':y_test,'Predicted':pred_price})

In [None]:
error_df=error_df.reset_index(drop=True)

In [None]:
plt.plot(error_df)

In [None]:
sns.boxplot(y=y_test)

In [None]:
sns.boxplot(y=df['MEDV'])

In [None]:
# R squared (Coefficient of determination)
# Adjusted R squared --> Penalises R squared for each additional variable

In [None]:
# Handling outliers

# Remove outliers
# Change the values
# Add more information

In [None]:
df_outliers_rmvd=df[df['MEDV']<35]

In [None]:
sns.boxplot(y=df_outliers_rmvd['MEDV'])

In [None]:
# Classification and Regression Trees

In [None]:
# Decision Trees for regression

# Condition based model
# Root node, branches, terminal nodes
# Root node is chosen based on the mean squared error
# Generalised model & hence largely unaffected by outliers
# Suffers from overfitting if the depth of the tree is not controlled.
# Easy to comprehend

In [None]:
from sklearn.tree import DecisionTreeRegressor

In [None]:
# Depth-limited regression tree. The seed is fixed (same value the train/test
# split uses) so the fitted tree and its scores are reproducible across
# Restart & Run All.
d = DecisionTreeRegressor(max_depth=8, random_state=2)

In [None]:
dtree_model = d.fit(x_train,y_train)

In [None]:
#!pip install dtreeviz

In [None]:
from dtreeviz.trees import dtreeviz

In [None]:
tree_img = dtreeviz(dtree_model, x_train,y_train, target_name='MEDV', feature_names=x_train.columns)

In [None]:
tree_img

In [None]:
# Predict on the test dataset

In [None]:
pred_price_tree = dtree_model.predict(x_test)

In [None]:
np.sqrt(mean_squared_error(y_test,pred_price_tree))

In [None]:
error_tree = pd.DataFrame({'Actual':y_test,'Predicted':pred_price_tree})

In [None]:
error_tree = error_tree.reset_index(drop=True)

In [None]:
error_tree

In [None]:
plt.plot(error_tree)

In [None]:
plt.plot(error_df)

In [None]:
# Ensemble --> Bagging

In [None]:
# Random Forest

In [None]:
from sklearn.ensemble import RandomForestRegressor

In [None]:
# Random forest with default hyper-parameters. The seed is fixed (same value
# the train/test split uses) so the bootstrap samples, and therefore the
# reported RMSE, are reproducible across Restart & Run All.
rf_model = RandomForestRegressor(random_state=2)

In [None]:
rf_model = rf_model.fit(x_train,y_train)

In [None]:
pred_price_rf = rf_model.predict(x_test)

In [None]:
np.sqrt(mean_squared_error(y_test,pred_price_rf))

In [None]:
df2 = pd.read_csv("D://data/PS_20174392719_1491204439457_log.csv")

In [None]:
df2.head()

In [None]:
df2.shape

In [None]:
df2['isFraud'].value_counts()

In [None]:
df2['isFlaggedFraud'].value_counts()

In [None]:
df_reg = pd.read_csv("D://data/Regression.csv")

In [None]:
df_reg.head()

In [None]:
# Restrict the correlation matrix to numeric/boolean columns explicitly.
# This frame presumably holds text columns (several are one-hot encoded later),
# and `DataFrame.corr()` raises on non-numeric data in recent pandas versions
# instead of silently skipping it.
df_reg.select_dtypes(include=['number', 'bool']).corr()

In [None]:
df_reg.dropna(inplace=True)

In [None]:
x= df_reg.iloc[:,:6]

In [None]:
y = df_reg['Purchase made']

In [None]:
# Mean and maximum purchase per job type. The original list requested the mean
# twice (`np.mean` and 'mean'), producing a duplicate column; string names are
# also the form pandas recommends over passing NumPy callables.
# NOTE(review): if the duplicate was meant to be another statistic (e.g. 'min'),
# add it to the list.
df_reg.groupby('Job Type').agg({'Purchase made': ['mean', 'max']})

In [None]:
sns.boxplot(y=df_reg['Purchase made'], x=df_reg['Metro City'])

In [None]:
df_reg.head()

In [None]:
dummies=pd.get_dummies(df_reg[['Job Type','Marital Status', 'Education','Metro City']], drop_first=True)

In [None]:
x1 = pd.concat([x.drop(['Job Type','Marital Status', 'Education','Metro City'],axis=1),dummies], axis=1)

In [None]:
# Call the method: the bare attribute `x1.dropna` only displayed the bound
# method object and never produced the NaN-free frame.
x1.dropna()

In [None]:
# Fit on a fresh estimator: `lr.fit` returns `lr` itself, so refitting `lr`
# here would silently overwrite the Boston model that `model` still refers to.
x1_complete = x1.dropna()
# Select the target by the surviving row labels so X and y stay aligned even
# if `dropna` removes rows.
mod_dummies = LinearRegression().fit(x1_complete, y.loc[x1_complete.index])