<a href="https://colab.research.google.com/github/Sudhir22/Plane-Accident-Severity/blob/master/Model_XGBoost%2BCat_Boost.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
!pip install -q xlrd
!git clone https://github.com/Sudhir22/Plane-Accident-Severity.git

In [0]:
pip install catboost

In [3]:
import pandas as pd
import xgboost as xbg
from sklearn.model_selection import train_test_split
import numpy as np
from sklearn.metrics import precision_score, recall_score, accuracy_score,f1_score
from sklearn.model_selection import KFold
import seaborn as sns
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from catboost import CatBoostClassifier


  import pandas.util.testing as tm


##**Loading and preprocessing Data**


In [0]:
train_data=pd.read_csv('Plane-Accident-Severity/train.csv')

In [0]:
train_data.head()

In [0]:
train_data.groupby(['Severity']).count()

In [0]:
train_data['Severity']=pd.Categorical(train_data['Severity'])
train_data['Severity_Code']=train_data['Severity'].cat.codes

###***Mean Encoding of Categorical Variables***

In [0]:
total_cases_accident_type=dict()
i=1
for index,row in train_data.groupby(['Accident_Type_Code']):
  total_cases_accident_type[i]=row.shape[0]
  i+=1


In [11]:
total_cases_accident_type

{1: 1171, 2: 1791, 3: 1752, 4: 2057, 5: 738, 6: 1215, 7: 1276}

In [0]:
accident_type_code_mean=dict()
for index,row in train_data.groupby(['Accident_Type_Code','Severity_Code']):
  accident_type_code_mean[index]=row.shape[0]/total_cases_accident_type[index[0]]

In [0]:
accident_type_code_mean_values=list()
for index,row in train_data.iterrows():
  accident_type_code_mean_values.append(accident_type_code_mean[(row['Accident_Type_Code'],row['Severity_Code'])])


In [0]:
train_data['Accident_Type_Code_Mean']=pd.Series(accident_type_code_mean_values)

###**Correlation graph**

In [0]:
data_corr=train_data.corr()

In [0]:
sns.heatmap(data_corr)

###**Selecting relevant features from the data**

In [17]:
X=train_data.iloc[:,[1,2,3,4,5,6,7,8,9,10,13]]
X.head()

Unnamed: 0,Safety_Score,Days_Since_Inspection,Total_Safety_Complaints,Control_Metric,Turbulence_In_gforces,Cabin_Temperature,Accident_Type_Code,Max_Elevation,Violations,Adverse_Weather_Metric,Accident_Type_Code_Mean
0,49.223744,14,22,71.285324,0.272118,78.04,2,31335.476824,3,0.424352,0.570073
1,62.465753,10,27,72.288058,0.423939,84.54,2,26024.711057,2,0.35235,0.570073
2,63.059361,13,16,66.362808,0.322604,78.86,7,39269.053927,3,0.003364,0.469436
3,48.082192,11,9,74.703737,0.337029,81.79,3,42771.4992,1,0.211728,0.589612
4,26.484018,13,25,47.948952,0.54114,77.16,3,35509.228515,2,0.176883,0.054224


In [0]:
Y=train_data.iloc[:,12]

###**Training, testing using cross-validation**

In [0]:
kf=KFold(n_splits=5)

In [0]:
param = {
    'eta': 0.2, 
    'max_depth': 12,  
    'objective': 'multi:softprob',  
    'gamma': 0.01,
    'num_class': 4} 

steps = 1000  # The number of training iterations

In [0]:
for x,y in kf.split(X):

  X_train,X_test = X.iloc[x,:],X.iloc[y,:]
  Y_train,Y_test = Y[x],Y[y]
  
  # XGBOOST classifier
  D_train = xbg.DMatrix(X_train, label=Y_train)
  D_test = xbg.DMatrix(X_test, label=Y_test)
  model = xbg.train(param, D_train, steps)
  preds = model.predict(D_test)
  best_preds = np.asarray([np.argmax(line) for line in preds])


  print("Precision = {}".format(precision_score(Y_test, best_preds, average='macro')))
  print("Recall = {}".format(recall_score(Y_test, best_preds, average='macro')))
  print("Accuracy = {}".format(accuracy_score(Y_test, best_preds)))
  print("F1-score = {}".format(f1_score(Y_test,best_preds,average='weighted')))
  print("********************************************************************")

  #CATBoost Classifier
  model3=CatBoostClassifier(iterations=1000,learning_rate=0.01,depth=12,loss_function='MultiClass',verbose=False)
  model3.fit(X_train,Y_train)
  best_preds_cat=model3.predict(X_test)
  preds_cat=model3.predict_proba(X_test)
  print("F1-score = {}".format(f1_score(Y_test,best_preds_cat,average='weighted')))


  average_preds=(np.array(preds)+np.array(preds_cat))/2.0
  best_preds_average = np.asarray([np.argmax(line) for line in average_preds])
  print("F1-score = {}".format(f1_score(Y_test,best_preds_average,average='weighted')))

  


###**Applying trained ML Model to the data**

In [0]:
test_data=pd.read_csv("Plane-Accident-Severity/test.csv")

In [0]:
test_X=test_data.iloc[:,0:10]

In [0]:
test_X.head()

In [0]:
D_test_test=xbg.DMatrix(test_X)
test_preds = model.predict(D_test_test)
best_test_preds = np.asarray([np.argmax(line) for line in test_preds])

In [0]:
severity_dict={1:'Minor_Damage_And_Injuries',2:'Significant_Damage_And_Fatalities',3:'Significant_Damage_And_Serious_Injuries',0:'Highly_Fatal_And_Damaging'}

In [0]:
severity_list=list()
for x in best_test_preds:
  severity_list.append(severity_dict[x])

In [0]:
test_data['Severity']=pd.Series(severity_list)

In [0]:
test_data[['Accident_ID','Severity']].to_csv("Plane-Accident-Severity/submission5.csv",index=False)