# **Model Training**

**Libraries**
- Pandas

- Numpy

- Matplotlib

- Seaborn

- Scikit-learn

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
# Blanket filter: every warning category is silenced for the rest of the session.
# NOTE(review): this also hides scikit-learn diagnostics such as ConvergenceWarning
# from the LogisticRegression fit below - consider narrowing it to specific categories.
warnings.filterwarnings('ignore')
from sklearn.model_selection import train_test_split,cross_val_score,GridSearchCV
from sklearn.preprocessing import LabelEncoder,OrdinalEncoder,MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix

**Load Dataset**

In [3]:
# Read the cleaned dataset from the relative path Data/Cleaned.csv.
# The bare `df` on the last line renders the frame as the cell output.
df = pd.read_csv(filepath_or_buffer='Data/Cleaned.csv')
df

Unnamed: 0.1,Unnamed: 0,Segment ID,PCI,Road Type,AADT,Asphalt Type,Last Maintenance,Average Rainfall,Rutting,IRI,Needs Maintenance,PCI level,AR level,Rutting level,AADT level,IRI level,Maintenance prevention
0,0,SID 155440,70.00,Secondary,4634,Concrete,2022,42.07,15.11,0.68,0,very good,medium,severe,low,very good,Routine Maintenance
1,1,SID 244365,37.87,Tertiary,3474,Asphalt,2017,79.60,19.67,0.99,1,poor,medium,severe,low,very good,Preventive Maintenance
2,2,SID 137867,88.32,Tertiary,1734,Concrete,2019,62.69,14.73,0.50,0,excellent,medium,severe,low,very good,Preventive Maintenance
3,3,SID 540519,52.30,Tertiary,3119,Asphalt,2008,79.32,17.01,1.04,1,fair,medium,severe,low,very good,Preventive Maintenance
4,4,SID 745776,49.10,Tertiary,3505,Asphalt,2019,71.81,21.08,1.18,1,fair,medium,very severe,low,very good,Preventive Maintenance
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1016788,1049994,SID 980091,49.66,Secondary,567,Concrete,2020,54.76,14.61,0.86,0,fair,medium,severe,very low,very good,Preventive Maintenance
1016789,1049995,SID 766608,60.37,Tertiary,2048,Concrete,2018,62.95,11.60,0.52,0,good,medium,severe,low,very good,Preventive Maintenance
1016790,1049996,SID 594559,89.70,Secondary,3792,Concrete,2019,54.06,13.06,0.58,0,excellent,medium,severe,low,very good,Preventive Maintenance
1016791,1049997,SID 345069,75.59,Secondary,33876,Concrete,2019,54.52,16.78,0.53,0,very good,medium,severe,very high,very good,Preventive Maintenance


**Setting Index**

In [4]:
# Use the 'Segment ID' values as the row index. drop=False keeps the column in
# the frame for now; it is removed by the drop in the following cell.
df.set_index('Segment ID', drop=False, inplace=True)

In [5]:
# Remove the 'Unnamed: 0' column and the now-redundant 'Segment ID' column
# (its values already live in the index).
# Note: re-running this cell raises KeyError because the columns are already gone.
df.drop(['Unnamed: 0', 'Segment ID'], axis='columns', inplace=True)

In [6]:
# Remember the 'Segment ID' index so it can be re-attached after the
# encoder/scaler return plain arrays.
index = df.index

**Split the DataFrame into categorical and numerical feature DataFrames**

In [7]:
# Categorical (string) columns - these are ordinal-encoded further down.
obj_df = df.select_dtypes(include=['object'])

# Numeric columns - these are min-max scaled further down.
# 'number' selects every numeric dtype, whereas the 'float'/'int' aliases only
# match specific widths and would silently skip e.g. float32 or unsigned columns.
# For the int64/float64 columns read from the CSV the selection is unchanged.
# Note: nothing is excluded here, so the 'Needs Maintenance' target is part of num_df.
num_df = df.select_dtypes(include='number')

In [8]:
# Transformers for the two column groups, both with default settings:
#   encoder - maps each category of the object columns to a number
#   norm    - rescales each numeric column with min-max scaling
# NOTE(review): with defaults the encoder does not follow severity order; the
# encoded output shows 'excellent' -> 0.0, 'fair' -> 2.0, 'good' -> 3.0,
# 'poor' -> 4.0 and 'very good' -> 5.0 for 'PCI level'. Confirm whether an
# explicit `categories=` ordering is wanted for the level columns.
encoder, norm = OrdinalEncoder(), MinMaxScaler()

**Encoding and Scaling Datasets**

In [9]:
# Learn the category mapping / min-max ranges and apply them in one go.
# Both results are plain arrays (column names and index are re-attached later).
# NOTE(review): the transformers are fitted on the full dataset, i.e. before the
# train/test split further down, so test rows influence the fitted parameters.
encoded = encoder.fit(obj_df).transform(obj_df)
normed = norm.fit(num_df).transform(num_df)

**Concatenate DataFrames**

In [10]:
# Wrap the transformed arrays back into DataFrames with their original column
# names. No index is passed, so both frames get a default positional index.
encodded_df = pd.DataFrame(data=encoded, columns=obj_df.columns)
normed_df = pd.DataFrame(data=normed, columns=num_df.columns)

In [11]:
# Put the encoded and scaled columns side by side, then restore the
# 'Segment ID' index saved earlier in `index`.
final_df = pd.concat([encodded_df, normed_df], axis='columns').set_index(index)

In [12]:
# Display the combined frame: encoded categorical columns first, scaled numeric columns after.
final_df

Unnamed: 0_level_0,Road Type,Asphalt Type,PCI level,AR level,Rutting level,AADT level,IRI level,Maintenance prevention,PCI,AADT,Last Maintenance,Average Rainfall,Rutting,IRI,Needs Maintenance
Segment ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
SID 155440,1.0,1.0,5.0,2.0,1.0,2.0,1.0,1.0,0.7000,0.023153,0.969231,0.324014,0.467694,0.356021,0.0
SID 244365,2.0,0.0,4.0,2.0,1.0,2.0,1.0,0.0,0.3787,0.017357,0.892308,0.613062,0.737996,0.518325,1.0
SID 137867,2.0,1.0,0.0,2.0,1.0,2.0,1.0,0.0,0.8832,0.008664,0.923077,0.482825,0.445169,0.261780,0.0
SID 540519,2.0,0.0,2.0,2.0,1.0,2.0,1.0,0.0,0.5230,0.015583,0.753846,0.610906,0.580320,0.544503,1.0
SID 745776,2.0,0.0,2.0,2.0,2.0,2.0,1.0,0.0,0.4910,0.017512,0.923077,0.553065,0.821577,0.617801,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
SID 980091,1.0,1.0,2.0,2.0,1.0,5.0,1.0,0.0,0.4966,0.002833,0.938462,0.421750,0.438056,0.450262,0.0
SID 766608,2.0,1.0,3.0,2.0,1.0,2.0,1.0,0.0,0.6037,0.010232,0.907692,0.484827,0.259632,0.272251,0.0
SID 594559,1.0,1.0,0.0,2.0,1.0,2.0,1.0,0.0,0.8970,0.018946,0.923077,0.416359,0.346177,0.303665,0.0
SID 345069,1.0,1.0,5.0,2.0,1.0,4.0,1.0,0.0,0.7559,0.169253,0.923077,0.419901,0.566686,0.277487,0.0


In [13]:
# Print dtypes and non-null counts to confirm every column is numeric and complete.
final_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1016793 entries, SID 155440 to SID 463864
Data columns (total 15 columns):
 #   Column                  Non-Null Count    Dtype  
---  ------                  --------------    -----  
 0   Road Type               1016793 non-null  float64
 1   Asphalt Type            1016793 non-null  float64
 2   PCI level               1016793 non-null  float64
 3   AR level                1016793 non-null  float64
 4   Rutting level           1016793 non-null  float64
 5   AADT level              1016793 non-null  float64
 6   IRI level               1016793 non-null  float64
 7   Maintenance prevention  1016793 non-null  float64
 8   PCI                     1016793 non-null  float64
 9   AADT                    1016793 non-null  float64
 10  Last Maintenance        1016793 non-null  float64
 11  Average Rainfall        1016793 non-null  float64
 12  Rutting                 1016793 non-null  float64
 13  IRI                     1016793 non-null  float64


**Assign independent and dependent variables**

In [14]:
# Separate the feature matrix from the label column.
# Note: the label was min-max scaled together with the numeric features, so
# `y` holds floats (0.0 / 1.0 in the frame shown above) rather than integers.
TARGET = 'Needs Maintenance'
x = final_df.drop(TARGET, axis=1)
y = final_df[TARGET]

**Split data into training and testing sets**

In [15]:
# Hold out 20% of the rows for testing. The split is stratified on `y` so both
# parts keep the same class balance, and seeded so it is repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

**Logistic Regression**

In [16]:
# Baseline: logistic regression with default hyper-parameters.
lr = LogisticRegression().fit(x_train, y_train)

# Accuracy on the held-out set, as a percentage rounded to two decimals.
lr_accuracy = lr.score(x_test, y_test)
round(lr_accuracy * 100, 2)

99.98

**Decision Tree**

In [17]:
# Decision tree with default hyper-parameters.
# random_state is fixed because tree construction is randomized; without it the
# fitted tree and its score can change from run to run.
# The seed matches the one used for the train/test split.
dtc = DecisionTreeClassifier(random_state=42)
dtc.fit(x_train, y_train)

# Accuracy on the held-out set, as a percentage.
dtc.score(x_test, y_test) * 100

99.95131762056265

**Random Forest**

In [18]:
# Random forest with 200 trees.
# random_state is fixed so the bootstrap samples and feature sub-sampling - and
# therefore the model that gets persisted at the end of the notebook - are
# reproducible across runs. The seed matches the one used for the split.
rfc = RandomForestClassifier(n_estimators=200, random_state=42)
rfc.fit(x_train, y_train)

# Accuracy on the held-out set, as a percentage.
rfc.score(x_test, y_test) * 100

99.99557432914206

**Models and their scores**

In [19]:
# Side-by-side test accuracy (percent, two decimals) of the three fitted models.
# Rounding to two decimals is why a score just below 100 prints as 100.0.
fitted_classifiers = {
    "Logistic Regression": lr,
    "Decision Tree": dtc,
    "Random Forest": rfc,
}
for clf_label, clf in fitted_classifiers.items():
    accuracy_pct = round(clf.score(x_test, y_test) * 100, 2)
    print(f"{clf_label} : {accuracy_pct}")

Logistic Regression : 99.98
Decision Tree : 99.95
Random Forest : 100.0


In [20]:
import joblib

# Persist the fitted random forest to 'pavementmodel.pkl' in the working directory.
# NOTE(review): only `rfc` is written here; the fitted `encoder` and `norm` are
# not saved in this cell. They are presumably needed to preprocess new data the
# same way before calling the model - confirm they are persisted elsewhere.
joblib.dump(value=rfc, filename='pavementmodel.pkl')

['pavementmodel.pkl']