In [None]:
# Importing libraries and dataset

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')

from scipy import stats

from sklearn.preprocessing import StandardScaler

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score, KFold

from sklearn.metrics import mean_squared_error,mean_absolute_error,r2_score

from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet, RidgeCV, LassoCV, ElasticNetCV, SGDRegressor

from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from xgboost import XGBRegressor
from sklearn.preprocessing import PolynomialFeatures

In [None]:
#Read/Importing data
df= pd.read_csv(r'FILENAME.csv')

In [None]:
#To Display all the rows and columns of table
pd.set_option('display.max_columns',None)
pd.set_option('display.max_rows',None) 

In [None]:
#Keeping copy of file for future 
df1= df.copy()

In [None]:
## Data Cleaning and Preprocessing

In [None]:
#To display top 5 rows of the table
df.head()

In [None]:
#To display number of rows and columns
df.shape

In [None]:
#To display columns, data types, non-null values
df.info()

In [None]:
#To display data types of columns
df.dtypes

In [None]:
#To display list of columns
df.columns

In [None]:
#To display number of missing values and treat them
df.isnull().sum()

In [None]:
# Percentage of missing values per column, largest first
(
    df.isnull().sum()
    .mul(100)
    .div(df.isnull().count())
    .sort_values(ascending = False)
)

In [None]:
#To visualise null values
sns.heatmap(df.isnull(), cbar=False)
plt.show()

In [None]:
# Drop every row that contains at least one missing value (in any column).
# NOTE(review): this cell runs before the cell that drops the columns with many
# missing values, so rows are also discarded because of gaps in columns that are
# removed afterwards - consider dropping those columns first.
df.dropna(axis=0, inplace=True)

In [None]:
#Dropping columns with many missing values
df.drop(['Column1','Column2','Column3'], axis=1, inplace=True)

In [None]:
# Replace missing values in one column with a specific value.
# `value` is a template placeholder: set it to the fill value before running.
# The result is assigned back because calling fillna(..., inplace=True) on the
# column selection df['column_name'] is chained assignment: pandas 2.x warns
# about it and, with Copy-on-Write, df itself is left unchanged.
df['column_name'] = df['column_name'].fillna(value)

In [None]:
# Replace missing values in one column with its mean
# (swap .mean() for .median() or .mode()[0] as needed).
# Uses `df` like every other cell (the original referenced an undefined `data`)
# and assigns the result back instead of relying on chained inplace=True.
df['column_name'] = df['column_name'].fillna(df['column_name'].mean())

In [None]:
#To display number of duplicate values
df.duplicated().sum()

In [None]:
df.drop_duplicates(inplace=True)

In [None]:
# Column labels split by dtype: 'object' columns are treated as categorical,
# every other dtype as numeric.
cat_cols = df.columns[(df.dtypes=='object').to_numpy()]
num_cols = df.columns[(df.dtypes!='object').to_numpy()]
print(cat_cols, num_cols, sep='\n')

In [None]:
# For each categorical column: name, number of distinct values, frequency table
for i in cat_cols:
    print(
        f'Feature {i}',
        f'Unique Values - {df[i].nunique()}',
        f'Value Counts\n{df[i].value_counts()}',
        '-'*40,
        sep='\n',
    )

In [None]:
#### Other basic functions

In [None]:
# Change data type to int or float
# Pass exactly one type: `int/float` would try to divide two type objects and
# raise TypeError. Use int instead of float when the column holds whole numbers
# and has no missing values.
df['column_name'] = df['column_name'].astype(float)

In [None]:
# Text based data

# Remove spaces from the start and end of the values.
df['column_name'] = df['column_name'].str.strip()

# Convert text to lowercase
df['column_name'] = df['column_name'].str.lower()

# Remove special characters from text.
# regex=True is required: since pandas 2.0, str.replace treats the pattern as a
# literal string by default, which would leave the text untouched. The raw
# string stops Python from reading \w and \s as (invalid) escape sequences.
df['column_name'] = df['column_name'].str.replace(r'[^\w\s]', '', regex=True)

# Tokenize text into words (each value becomes a list of words)
df['column_name'] = df['column_name'].str.split()

In [None]:
# Date time data

# Convert string column to datetime format
df['date_column'] = pd.to_datetime(df['date_column'])

# Extract year from a datetime column
df['year'] = df['date_column'].dt.year

# Extract month from a datetime column
df['month'] = df['date_column'].dt.month

# Extract day from a datetime column
df['day'] = df['date_column'].dt.day

# Calculate time differences
df['time_diff'] = df['end_time'] - df['start_time']

In [None]:
# To rename a column
df.rename(columns={'Old_Column_Name':'New_Column_Name'},inplace=True)

# To replace specific values in a column.
# Assigned back rather than replace(..., inplace=True) on df['column_name']:
# the chained inplace form warns in pandas 2.x and does not modify df under
# Copy-on-Write.
df['column_name'] = df['column_name'].replace({'old_value': 'new_value'})

In [None]:
#To remove a column from dataframe
df.drop('name',axis=1,inplace=True)

In [None]:
## EDA - Exploratory Data Analysis

In [None]:
#To display countplot of target variable
sns.countplot(x=df['target'])
plt.title('Countplot for Target')
plt.show()

In [None]:
#To display basic statistical values for numeric columns
df.describe()

In [None]:
# Bar chart of the mean of the dependent variable for each value of column_name.
# The dependent column is selected before aggregating: calling .mean() on the
# whole grouped frame raises TypeError in pandas 2.x when any other column is
# non-numeric, and needlessly averages every column otherwise.
df.groupby("column_name")["Dependent_column"].mean().plot(kind='bar')
plt.title("City column_name and Dependent_column Analysis")
plt.show()

In [None]:
# Distribution of column
# The Series is passed as x= (as in the other countplot cells): in seaborn 0.12+
# the first positional argument is `data`, so a positional Series is not
# interpreted as the x variable.
sns.countplot(x=df['Column_Name'])
plt.title('Distribution of Column_Name')
plt.xlabel('Column_Name')
# Save before plt.show(), which clears the current figure.
plt.savefig('Name1.png')
plt.show()

In [None]:
# Distribution of column with respect to target variable
sns.countplot(x=df['Column_Name'],hue=df['Target_Column'])
plt.title(f'Countplot for Column_Name wrt Target_Column')
plt.savefig('Name2.png')
plt.show()

In [None]:
#To display histogram for all numeric columns
for i in num_cols:
    sns.histplot(x=df[i])
    # The figure is a histogram, so the title says so (it previously read "Boxplot").
    plt.title(f'Histogram for {i}')
    plt.show()

In [None]:
##To display count plot for all categorical columns
for i in cat_cols:
    sns.countplot(x=df[i])
    plt.title(f'Countplot for {i}')
    plt.show()

In [None]:
#To display pie chart for all categorical columns
for i in cat_cols:
    count = df[i].value_counts()
    labels = count.index.tolist()
    sizes = count.values.tolist()
    plt.figure()
    plt.pie(sizes, labels=labels, autopct='%1.1f%%')
    plt.title(f'Pie chart for {i}')
    plt.show()

In [None]:
#To display box plot between numeric and categorical columns
for i in cat_cols:
    sns.boxplot(x=df[i],y=df['Dependent_column'])
    plt.title(f'Boxplot of {i} VS Dependent_column')
    plt.show()

In [None]:
# Scatter plot of two numeric columns
plt.scatter(df['numeric_column1'], df['numeric_column2'])
plt.xlabel('Numeric Column 1')
plt.ylabel('Numeric Column 2')
plt.show()

In [None]:
# Cross-tabulation between two categorical columns
cross_tab = pd.crosstab(df['categorical_column1'], df['categorical_column2'])
print(cross_tab)

In [None]:
# Pair plot
sns.pairplot(data=df)
plt.show()

In [None]:
# Distribution plot (histogram with a KDE curve)
# sns.distplot is deprecated in seaborn; histplot with kde=True and
# stat='density' is its closest replacement.
sns.histplot(x=df['Column_name'], kde=True, stat='density')
plt.show()

# Distribution plot of the log-transformed column
# NOTE: np.log gives -inf for zeros and NaN for negative values.
sns.histplot(x=np.log(df['Column_name']), kde=True, stat='density')
plt.show()

In [None]:
#Multi-variate Analysis

In [None]:
#To display correlation between different columns
# numeric_only=True restricts the matrix to numeric columns; without it pandas
# 2.x raises on the first object/string column still present in df.
corr = df.corr(numeric_only=True)
plt.figure(figsize=(25,25))
sns.heatmap(corr, annot=True, cmap='RdBu')
plt.show()

In [None]:
# Outlier Treatment

In [None]:
# Remove rows based on a condition
df = df[df['column_name'] != 'value_to_remove']

In [None]:
# Remove outliers based on domain knowledge or business rules
df = df[(df['column_name'] >= lower_threshold) & (df['column_name'] <= upper_threshold)]

In [None]:
df.describe(percentiles=[0.01,0.03,0.05,0.10,0.90,0.95,0.97,0.99]).T

In [None]:
def out_treat(x, lower_q=0.01, upper_q=0.97):
    """Cap outliers of a numeric Series at two of its own quantiles.

    Parameters
    ----------
    x : pandas.Series
        Numeric values to treat.
    lower_q : float, default 0.01
        Quantile used as the floor; smaller values are raised to it.
    upper_q : float, default 0.97
        Quantile used as the ceiling; larger values are lowered to it.

    Returns
    -------
    pandas.Series
        A clipped copy of ``x``; ``x`` itself is not modified.
    """
    # Both bounds come from the untouched data. Computing the lower quantile
    # after the upper clip (as before) lets the first clip shift the second
    # bound on short columns.
    lower_bound = x.quantile(lower_q)
    upper_bound = x.quantile(upper_q)
    return x.clip(lower=lower_bound, upper=upper_bound)

In [None]:
# Apply the outlier capping to numeric columns only.
# select_dtypes(include='number') leaves out bool and datetime columns, which
# the previous `dtypes != 'object'` filter also picked up even though they are
# not meaningful inputs for quantile-based clipping.
# NOTE(review): this also clips the dependent column if it is numeric - confirm
# that is intended.
num_cols = df.select_dtypes(include='number').columns
df[num_cols] = df[num_cols].apply(out_treat)

In [None]:
# Encoding categorical Columns

In [None]:
df = pd.get_dummies(df,columns=cat_cols,drop_first=True)
print(df.shape)
print(df.dtypes)

In [None]:
#### Label Encoding

In [None]:
from sklearn.preprocessing import LabelEncoder
lb = LabelEncoder()

In [None]:
cat_cols = df.dtypes[df.dtypes=='object'].index
print(cat_cols)

In [None]:
# Label-encode every categorical column with its own encoder.
# Refitting one shared encoder per column keeps only the last column's classes_,
# so earlier columns could no longer be inverse-transformed or applied to new
# data. label_encoders[column] holds the fitted encoder for each column.
label_encoders = {}
for i in cat_cols:
    label_encoders[i] = LabelEncoder()
    df[i] = label_encoders[i].fit_transform(df[i])

In [None]:
# Get Dummies - OneHotEncoding

In [None]:
df.Column_name = df.Column_name.replace({'Column_values1':'1', 'Column_values2':'2'})

In [None]:
df = pd.get_dummies(df, columns = ['Column_name'],prefix = 'Column_c')

In [None]:
from sklearn.preprocessing import OneHotEncoder

In [None]:
# One-hot encode a single column with scikit-learn
ohe = OneHotEncoder()
res_ohe = ohe.fit_transform(df[['Column_Name']]).toarray()
# Column names come from the fitted encoder, so they match the categories
# actually found (any number, in the encoder's order) instead of two hard-coded
# labels. index=df.index keeps the rows aligned with df, whose index is no
# longer 0..n-1 after rows have been dropped.
res_ohe_df = pd.DataFrame(res_ohe, columns=ohe.get_feature_names_out(), index=df.index)
res_ohe_df.head()

In [None]:
#Select the Dependent and Independent Features
# NOTE(review): x_resampled and y_resampled are not assigned anywhere in the
# visible notebook, so this cell raises NameError on a fresh kernel unless a
# resampling step is added before it. x and y are assigned again from df in a
# later cell, just before the train/test split.
x = x_resampled
y = y_resampled
print(type(x),type(y))
print(x.shape,y.shape)

In [None]:
# Model building

In [None]:
# Creating a function to evaluate the model performance

In [None]:
def eval_model(model,x_train,x_test,y_train,y_test):
    """Fit ``model`` on the training split and report regression metrics.

    The results are printed and also published as module-level globals
    (``train_score``, ``test_score``, ``mae``, ``mse``, ``rmse``, ``r2``,
    ``ypred``) so that ``update_score_card`` can record them afterwards.

    Parameters
    ----------
    model : estimator exposing ``fit``, ``score`` and ``predict``.
    x_train, y_train : training features and target.
    x_test, y_test : held-out features and target.

    Returns
    -------
    None
    """
    global train_score, test_score, mae, mse, rmse, r2, ypred

    model.fit(x_train,y_train)
    train_score = round(model.score(x_train,y_train),2)
    test_score = round(model.score(x_test,y_test),2)
    # Store the predictions in the declared global `ypred`; the previous code
    # wrote to a local `y_pred`, so `ypred` was never defined for later cells.
    ypred = model.predict(x_test)

    mae = round(mean_absolute_error(y_test,ypred),2)
    raw_mse = mean_squared_error(y_test,ypred)
    mse = round(raw_mse,2)
    # Take the root of the unrounded MSE so the rounding error is not carried
    # into the RMSE (a small MSE that rounds to 0.0 gave an RMSE of 0.0).
    rmse = round(np.sqrt(raw_mse),2)
    r2 = round(r2_score(y_test,ypred),2)

    print("Evaluation Metrics:")
    print(f"Mean Absolute Error (MAE): {mae}")
    print(f"Mean Squared Error (MSE): {mse}")
    print(f"Root Mean Squared Error (RMSE): {rmse}")
    print(f"R-squared (R²) Score: {r2}")

    print(f"Training Score: {train_score}")
    print(f"Testing Score: {test_score}")


In [None]:
# One row per evaluated model; filled in by update_score_card().
score_card = pd.DataFrame(columns=['Model', 'Train Score', 'Test Score', 'Mean Absolute Error (MAE)' , 'Mean Squared Error (MSE)',
                                   'Root Mean Squared Error (RMSE)', 'R-squared Score'])

def update_score_card(model_name):
    """Append the metrics of the most recent ``eval_model`` run to ``score_card``.

    Reads the globals ``train_score``, ``test_score``, ``mae``, ``mse``,
    ``rmse`` and ``r2`` set by ``eval_model`` and stores them under
    ``model_name``.

    Parameters
    ----------
    model_name : str
        Label written to the 'Model' column.

    Returns
    -------
    pandas.DataFrame
        The updated ``score_card`` (also rebound as the module-level global).
    """
    global score_card
    new_row = pd.DataFrame(
        [{'Model': model_name, 'Train Score': train_score, 'Test Score': test_score,
          'Mean Absolute Error (MAE)' : mae, 'Mean Squared Error (MSE)': mse,
          'Root Mean Squared Error (RMSE)': rmse, 'R-squared Score': r2}],
        columns=score_card.columns,
    )
    # DataFrame.append was removed in pandas 2.0, so rows are added with
    # pd.concat. The first row replaces the empty frame outright, which keeps
    # the numeric dtypes of the metric columns.
    if score_card.empty:
        score_card = new_row
    else:
        score_card = pd.concat([score_card, new_row], ignore_index=True)
    return(score_card)

In [None]:
#Select the Dependent and Independent Features
x = df.drop('Dependent_column',axis=1)
y = df['Dependent_column']
print(type(x),type(y))
print(x.shape,y.shape)

In [None]:
# Hold out 30% of the rows for testing; the fixed random_state makes the split repeatable
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.30,random_state=1000)
# Shapes of the four pieces, one per line
print(x_train.shape, x_test.shape, y_train.shape, y_test.shape, sep='\n')

In [None]:
# Linear regression

In [None]:
lr = LinearRegression()
eval_model(lr,x_train,x_test,y_train,y_test)

In [None]:
update_score_card(model_name = 'Linear Regression')

In [None]:
# Cross-validation of linear regression on the full x / y (not only the training split)
model_lr  = LinearRegression()
cv_res = cross_val_score(model_lr,x,y,scoring='r2',cv=5)  # cv = number of folds
print(cv_res)  # one R2 score per fold, i.e. 5 scores for the 5 different splits
print(cv_res.mean())  # average R2 across the folds

In [None]:
# SGDRegressor
# random_state fixes the shuffling done by stochastic gradient descent so that
# repeated runs give the same scores, as in the split and random-forest cells.
# NOTE(review): SGD is sensitive to feature scale; StandardScaler is imported
# but never applied in this notebook - consider scaling x first.
sgd1 = SGDRegressor(max_iter=10000,eta0=0.0001,loss='squared_error',random_state=10)
eval_model(sgd1,x_train,x_test,y_train,y_test)

In [None]:
update_score_card(model_name = 'SGD Regression')

In [None]:
# Ridge Regression
ridge1 = Ridge(alpha=1)
eval_model(ridge1,x_train,x_test,y_train,y_test)

In [None]:
update_score_card(model_name = 'Ridge Regression')

In [None]:
# RidgeCV Regression
ridge2 = RidgeCV(alphas=np.arange(0.1,1,0.01), cv=20,scoring='neg_mean_squared_error')
eval_model(ridge2,x_train,x_test,y_train,y_test)

In [None]:
update_score_card(model_name = 'RidgeCV Regression')

In [None]:
# Lasso Regression
lasso1 = Lasso(alpha=1)
eval_model(lasso1,x_train,x_test,y_train,y_test)

In [None]:
update_score_card(model_name = 'Lasso Regression')

In [None]:
# LassoCV Regression
lasso2 = LassoCV(alphas=np.arange(0.1,1,0.01),cv=20)
eval_model(lasso2,x_train,x_test,y_train,y_test)

In [None]:
update_score_card(model_name = 'LassoCV Regression')

In [None]:
# ElasticNet Regression
enet1 = ElasticNet(alpha=1)
eval_model(enet1,x_train,x_test,y_train,y_test)

In [None]:
# Record the ElasticNet results (label spelling corrected from 'ElasiticNet')
update_score_card(model_name = 'ElasticNet Regression')

In [None]:
# ElasticNetCV Regression
enet2 = ElasticNetCV(alphas=np.arange(0.1,1,0.01),cv=20)
eval_model(enet2,x_train,x_test,y_train,y_test)

In [None]:
update_score_card(model_name = 'ElasticNetCV Regression')

In [None]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor

In [None]:
# DecisionTree Regression  #criterion='squared_error', max_depth=N, min_samples_split=N
dt = DecisionTreeRegressor(criterion='squared_error')
eval_model(dt,x_train,x_test,y_train,y_test)

In [None]:
update_score_card(model_name = 'Decision Tree Regression')

In [None]:
# Random Forest Regression  # tunable: max_depth, max_leaf_nodes, min_samples_leaf, min_samples_split, n_estimators
# n_estimators was the undefined placeholder N (NameError); 100 is the
# scikit-learn default and a reasonable starting point - tune as needed.
rf = RandomForestRegressor(n_estimators=100, random_state=10)
eval_model(rf,x_train,x_test,y_train,y_test)

In [None]:
update_score_card(model_name = 'Random Forest Regression')

In [None]:
# Support Vector Regression  # kernel='poly', C=N, degree=N, gamma='scale/auto'
sv = SVR()
eval_model(sv,x_train,x_test,y_train,y_test)

In [None]:
update_score_card(model_name = 'Support Vector Regression')

In [None]:
# KNN Regression
# n_neighbors was the undefined placeholder N (NameError); 5 is the
# scikit-learn default - tune as needed.
knr = KNeighborsRegressor(n_neighbors=5)
eval_model(knr,x_train,x_test,y_train,y_test)

In [None]:
update_score_card(model_name = 'KNN Regression')

In [None]:
# XGBoost Regressor
# 'reg:squarederror' is the current name of the deprecated 'reg:linear'
# objective, and random_state replaces the deprecated `seed` alias.
xgb = XGBRegressor(objective ='reg:squarederror', n_estimators = 10, random_state = 123)
# Evaluate the XGBoost model itself; the previous call passed `rf`, so the
# random-forest scores were recorded under the XGBoost label.
eval_model(xgb,x_train,x_test,y_train,y_test)

In [None]:
update_score_card(model_name = 'XGBoost Regression')

In [None]:
# Polynomial Regression: degree-3 feature expansion followed by a linear model.
# PolynomialFeatures is only a transformer, so it is chained with
# LinearRegression in a pipeline. The previous cell created the transformer
# but then evaluated `rf`, recording random-forest scores under the
# polynomial-regression label.
from sklearn.pipeline import make_pipeline

poly = make_pipeline(PolynomialFeatures(degree=3), LinearRegression())
eval_model(poly,x_train,x_test,y_train,y_test)

In [None]:
update_score_card(model_name = 'Polynomial Regression')

In [None]:
# Model Comparison

In [None]:
score_card

In [None]:
score_card['Test Score'].idxmax()

In [None]:
# Name of the model on the score-card row with the highest test score
print(f"{score_card.loc[score_card['Test Score'].idxmax(), 'Model']} is the best performing model")

In [None]:
# Saving the Model
import pickle

# Select the alias of the best-fit model here (e.g. lr, dt, rf, knr, sv, xgb).
# The previous expression `lr/dt1/rf/knn1/sv1` tried to divide the model
# objects and referenced aliases that are not defined in this notebook.
best_model = lr

# The context manager closes the file even if pickling fails.
with open('Final model.pkl','wb') as model_file:
    pickle.dump(best_model, model_file)