In [87]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score, KFold, StratifiedKFold
from sklearn import preprocessing
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import GradientBoostingClassifier, AdaBoostClassifier
from sklearn.ensemble import VotingClassifier
from sklearn import svm, tree
import xgboost
import warnings
warnings.filterwarnings("ignore")
%matplotlib inline

plt.rcParams['figure.figsize'] = [10, 8]

In [88]:
# Fetch the file
data_file = project.get_file("problem_train.csv")

# Read the CSV data file from the object storage into a pandas DataFrame
data_file.seek(0)
train = pd.read_csv(data_file)
train.shape

(10000, 12)

In [89]:
# Getting all clomuns
for i in train.columns:
    print(i) 
    
#train.head(0)

Severity
Safety_Score
Days_Since_Inspection
Total_Safety_Complaints
Control_Metric
Turbulence_In_gforces
Cabin_Temperature
Accident_Type_Code
Max_Elevation
Violations
Adverse_Weather_Metric
Accident_ID


In [90]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 12 columns):
Severity                   10000 non-null object
Safety_Score               10000 non-null float64
Days_Since_Inspection      10000 non-null int64
Total_Safety_Complaints    10000 non-null int64
Control_Metric             10000 non-null float64
Turbulence_In_gforces      10000 non-null float64
Cabin_Temperature          10000 non-null float64
Accident_Type_Code         10000 non-null int64
Max_Elevation              10000 non-null float64
Violations                 10000 non-null int64
Adverse_Weather_Metric     10000 non-null float64
Accident_ID                10000 non-null int64
dtypes: float64(6), int64(5), object(1)
memory usage: 937.6+ KB


In [91]:
train.head()

Unnamed: 0,Severity,Safety_Score,Days_Since_Inspection,Total_Safety_Complaints,Control_Metric,Turbulence_In_gforces,Cabin_Temperature,Accident_Type_Code,Max_Elevation,Violations,Adverse_Weather_Metric,Accident_ID
0,Minor_Damage_And_Injuries,49.223744,14,22,71.285324,0.272118,78.04,2,31335.476824,3,0.424352,7570
1,Minor_Damage_And_Injuries,62.465753,10,27,72.288058,0.423939,84.54,2,26024.711057,2,0.35235,12128
2,Significant_Damage_And_Fatalities,63.059361,13,16,66.362808,0.322604,78.86,7,39269.053927,3,0.003364,2181
3,Significant_Damage_And_Serious_Injuries,48.082192,11,9,74.703737,0.337029,81.79,3,42771.4992,1,0.211728,5946
4,Significant_Damage_And_Fatalities,26.484018,13,25,47.948952,0.54114,77.16,3,35509.228515,2,0.176883,9054


In [92]:
train.describe()

Unnamed: 0,Safety_Score,Days_Since_Inspection,Total_Safety_Complaints,Control_Metric,Turbulence_In_gforces,Cabin_Temperature,Accident_Type_Code,Max_Elevation,Violations,Adverse_Weather_Metric,Accident_ID
count,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0
mean,41.876406,12.9311,6.5643,65.145324,0.381495,79.969331,3.8149,32001.803282,2.0122,0.255635,6266.5542
std,16.138072,3.539803,6.971982,11.882934,0.121301,2.759739,1.902577,9431.995196,1.03998,0.381128,3610.170288
min,0.0,1.0,0.0,0.0,0.134,74.74,1.0,831.695553,0.0,0.000316,2.0
25%,30.593607,11.0,2.0,56.927985,0.293665,77.96,2.0,25757.636908,1.0,0.012063,3139.75
50%,41.278539,13.0,4.0,65.587967,0.365879,79.54,4.0,32060.336419,2.0,0.074467,6280.5
75%,52.511416,15.0,9.0,73.336372,0.451346,81.56,5.0,38380.641513,3.0,0.354059,9391.5
max,100.0,23.0,54.0,100.0,0.882648,97.51,7.0,64297.651218,5.0,2.365378,12500.0


In [93]:
# Checking for columns with null values
null_cols = [i for i in train.columns if train[i].isnull().any()]
null_cols

[]

In [94]:
# Getting all prediction labels
for i in train.Severity.unique():
    print(i)

Minor_Damage_And_Injuries
Significant_Damage_And_Fatalities
Significant_Damage_And_Serious_Injuries
Highly_Fatal_And_Damaging


In [95]:
train.Accident_Type_Code.unique()

array([2, 7, 3, 4, 1, 6, 5])

In [96]:
#t=np.sort(train.iloc[:,2])
#plt.plot(t, '--bo')
#plt.show()

In [97]:
map = {'Minor_Damage_And_Injuries': 1,
       'Significant_Damage_And_Fatalities': 2,
       'Significant_Damage_And_Serious_Injuries': 3,
       'Highly_Fatal_And_Damaging': 4}
inverted_dict = dict([[v,k] for k,v in map.items()])
train['Severity'] = train['Severity'].map(map)

In [98]:
train.head()

Unnamed: 0,Severity,Safety_Score,Days_Since_Inspection,Total_Safety_Complaints,Control_Metric,Turbulence_In_gforces,Cabin_Temperature,Accident_Type_Code,Max_Elevation,Violations,Adverse_Weather_Metric,Accident_ID
0,1,49.223744,14,22,71.285324,0.272118,78.04,2,31335.476824,3,0.424352,7570
1,1,62.465753,10,27,72.288058,0.423939,84.54,2,26024.711057,2,0.35235,12128
2,2,63.059361,13,16,66.362808,0.322604,78.86,7,39269.053927,3,0.003364,2181
3,3,48.082192,11,9,74.703737,0.337029,81.79,3,42771.4992,1,0.211728,5946
4,2,26.484018,13,25,47.948952,0.54114,77.16,3,35509.228515,2,0.176883,9054


In [99]:
#train = train.iloc[:,0:11]
#train.head()

In [79]:
# Conversion to category type
#train["Accident_Type_Code"] = train["Accident_Type_Code"].astype('category')
#train["Severity"] = train["Severity"].astype('category')

In [80]:
#correlation = train.corr()
#correlation
#sns.heatmap(correlation, cmap = 'coolwarm')

In [123]:
# Train-test-split
X, y = train.iloc[:,1:], train.iloc[:,0]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=456)

In [124]:
# Scaling
scaler = preprocessing.MinMaxScaler()
scaled_X_train = scaler.fit_transform(X_train.to_numpy())
scaled_X_test = scaler.transform(X_test.to_numpy())
X_train = pd.DataFrame(scaled_X_train)
X_test = pd.DataFrame(scaled_X_test)

In [125]:
X_train.columns = train.columns[1:]
X_test.columns = train.columns[1:]

In [126]:
classifiers=[]
#model1 = xgboost.XGBClassifier(random_state=123)
#classifiers.append(model1)
#model2 = svm.SVC(random_state=123)
#classifiers.append(model2)
model3 = tree.DecisionTreeClassifier(random_state=123)
classifiers.append(model3)
model4 = RandomForestClassifier(random_state=123)
classifiers.append(model4)
model5 = BaggingClassifier(random_state=123,base_estimator=model3, n_estimators=60)
classifiers.append(model5)
#model6 = AdaBoostClassifier(random_state=123)
#classifiers.append(model6)
model7 = GradientBoostingClassifier(random_state=123)
classifiers.append(model7)

In [127]:
for clf in classifiers:
    #clf.fit(X_train, y_train)
    #y_pred= clf.predict(X_test)
    #acc = accuracy_score(y_test, y_pred)
    #print("Accuracy of %s is %s"%(clf, acc))
    kfold = KFold(n_splits=5, random_state=123)
    cv_results = cross_val_score(clf, X_train, y_train, cv=kfold, scoring='f1_weighted')
    print("______________________________")
    print("CV score: "+str(cv_results.mean()))
    print("//////////////////////////////////////////////////////////////")

______________________________
CV score: 0.9156319103053094
//////////////////////////////////////////////////////////////
______________________________
CV score: 0.8787399709240591
//////////////////////////////////////////////////////////////
______________________________
CV score: 0.943048457615177
//////////////////////////////////////////////////////////////
______________________________
CV score: 0.8648099963568917
//////////////////////////////////////////////////////////////


In [None]:
# model5 - Bagging Classifier gives best CV Score

In [21]:
data_file = project.get_file("problem_test.csv")

# Read the CSV data file from the object storage into a pandas DataFrame
data_file.seek(0)
test = pd.read_csv(data_file)
test_scaled = scaler.transform(test)

In [22]:
model5.fit(X_train, y_train)
y_pred= model5.predict(test_scaled)

In [243]:
y_pred

array([4, 2, 3, ..., 3, 3, 4])

In [23]:
submit = pd.DataFrame([test.Accident_ID,y_pred])
submit = submit.T
submit.columns = ['Accident_ID','Severity']
submit.describe()
submit['Severity']=submit['Severity'].map(inverted_dict)

In [28]:
submit.reindex(submit.Accident_ID)

Unnamed: 0_level_0,Accident_ID,Severity
Accident_ID,Unnamed: 1_level_1,Unnamed: 2_level_1
1,10.0,Significant_Damage_And_Fatalities
10,95.0,Significant_Damage_And_Serious_Injuries
14,129.0,Significant_Damage_And_Fatalities
17,137.0,Highly_Fatal_And_Damaging
21,166.0,Significant_Damage_And_Serious_Injuries
40,231.0,Minor_Damage_And_Injuries
45,240.0,Highly_Fatal_And_Damaging
50,261.0,Minor_Damage_And_Injuries
52,266.0,Significant_Damage_And_Fatalities
85,433.0,Highly_Fatal_And_Damaging
