In [None]:
# data analysis and wrangling
#https://www.kaggle.com/startupsci/titanic-data-science-solutions
    
import pandas as pd
import numpy as np
import random as rnd

# visualization
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

# machine learning
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import Perceptron
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier

In [None]:
# Load the cleaned general-aviation accident records.
# The file is decoded as latin1, and low_memory=False is passed so pandas
# infers each column's dtype from the whole file rather than chunk by chunk.
csv_path = 'GeneralAviationUSLLClean.csv'
aviation = pd.read_csv(csv_path, low_memory=False, encoding='latin1')
aviation.shape

In [None]:
#Split into train and test set
def split_train_test(data, test_ratio):
    """Randomly partition ``data`` into a training part and a test part.

    Rows are shuffled with NumPy's global random generator, so the result is
    only repeatable if the caller seeds ``np.random`` beforehand.

    Parameters:
        data: object supporting ``len()`` and positional ``.iloc`` indexing.
        test_ratio: fraction of rows placed in the test part; the test size
            is ``int(len(data) * test_ratio)`` (truncated).

    Returns:
        A ``(train, test)`` tuple of ``data.iloc[...]`` selections.
    """
    n_rows = len(data)
    order = np.random.permutation(n_rows)
    cut = int(n_rows * test_ratio)
    # The first `cut` shuffled positions form the test part, the rest train.
    held_out = order[:cut]
    kept = order[cut:]
    return data.iloc[kept], data.iloc[held_out]

In [None]:
# Keep the three categorical predictors plus the Lethality target and drop
# rows with any missing value. The explicit .copy() makes df an independent
# frame, so the column assignments below cannot raise SettingWithCopyWarning
# or be written to a temporary slice of `aviation`.
df = aviation[['WeatherCondition','BroadPhaseofFlight','PurposeofFlight', "Lethality"]].dropna().copy()
df["Wx"]= df["WeatherCondition"].astype('category')
df["Phase"]= df["BroadPhaseofFlight"].astype('category')
df["Purpose"]= df["PurposeofFlight"].astype('category')

# Integer category codes used as model features:
# CWx = weather condition, CPhase = phase of flight in which the accident
# occurred, CPurpose = purpose of flight.
df["CWx"]= df["Wx"].cat.codes
df["CPhase"]= df["Phase"].cat.codes
df["CPurpose"]= df["Purpose"].cat.codes

In [None]:
# Modelling frame: the target followed by the three integer-coded features.
df_final = df.loc[:, ["Lethality", 'CWx', 'CPhase', 'CPurpose']]
df_final.dtypes

In [None]:
# Weather condition: distribution of the integer category codes in CWx.
# Title and axis labels are set so the figure can be read on its own; the
# trailing semicolon suppresses the repr of the value returned by ax.set().
ax = df_final["CWx"].hist()
ax.set(title="Weather condition (category code)", xlabel="CWx code", ylabel="Number of records");

In [None]:
# Phase of flight in which the accident occurred: distribution of the integer
# category codes in CPhase. Title and axis labels are set so the figure can be
# read on its own; the trailing semicolon suppresses the repr of ax.set().
ax = df_final["CPhase"].hist()
ax.set(title="Phase of flight (category code)", xlabel="CPhase code", ylabel="Number of records");

In [None]:
from sklearn.model_selection import train_test_split
# 80/20 split with a fixed seed so the partition is reproducible.
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)
# Show only the shapes; displaying the full train/test frames floods the output.
df.shape, train_df.shape, test_df.shape

In [None]:
# train_rg keeps the target next to the features; its column order is used
# later to label the logistic-regression coefficients.
train_rg = train_df[['Lethality','CWx','CPhase','CPurpose']]
X_train = train_df[['CWx','CPhase','CPurpose']]
Y_train = train_df["Lethality"]
X_test  = test_df[['CWx','CPhase','CPurpose']]
# Select the target as a 1-D Series (single brackets) so Y_test has the same
# shape as Y_train and as the 1-D arrays returned by predict().
Y_test  = test_df['Lethality']
X_train.shape, Y_train.shape, X_test.shape, Y_test.shape

In [None]:
from sklearn.model_selection import cross_val_score
# Full (unsplit) feature matrix and target, used by the cross-validation cells.
X = df_final.loc[:, ['CWx','CPhase','CPurpose']]
y = df_final.loc[:, 'Lethality']

In [None]:
from sklearn.metrics import mean_absolute_error, average_precision_score
from sklearn.metrics import recall_score, f1_score, mean_squared_error

def rmse(Y_test, Y_pred):
    """Return the root-mean-squared error between targets and predictions.

    Parameters:
        Y_test: array-like of true values.
        Y_pred: array-like of predicted values with the same number of elements.

    Returns:
        The square root of the mean squared difference, as a NumPy float.
    """
    # The expression must sit on the same line as `return`; a bare `return`
    # followed by the expression on the next line always yields None.
    # Both inputs are flattened so a single-column frame and a 1-D array
    # are compared element by element instead of being broadcast.
    actual = np.asarray(Y_test, dtype=float).ravel()
    predicted = np.asarray(Y_pred, dtype=float).ravel()
    return np.sqrt(((actual - predicted) ** 2).mean())
    
def display_scores(Y_test, Y_pred):
    """Print MAE, average precision, recall and F1 for one set of predictions.

    Parameters:
        Y_test: true labels.
        Y_pred: predicted labels, passed unchanged to each sklearn metric.
    """
    # Every metric is evaluated before anything is printed, so a metric that
    # raises leaves no partial report behind.
    report = [
        ("MAE:", mean_absolute_error(Y_test, Y_pred)),
        ("Average Precision Score:", average_precision_score(Y_test, Y_pred)),
        ("Recall:", recall_score(Y_test, Y_pred)),
        ("F1:", f1_score(Y_test, Y_pred)),
    ]
    for label, value in report:
        print(label, value)
    
def display_cross_val_scores(scores):
    """Print the per-fold scores followed by their mean and standard deviation.

    Parameters:
        scores: object exposing ``.mean()`` and ``.std()`` (e.g. a NumPy array).
    """
    print("Cross Validation Scores:", scores)
    # Each statistic is looked up and evaluated only when its line is printed.
    for heading, stat_name in (("Mean:", "mean"), ("Standard deviation:", "std")):
        print(heading, getattr(scores, stat_name)())

In [None]:
# Logistic Regression
# fit() returns the fitted estimator, so construction and training are chained.
logreg = LogisticRegression().fit(X_train, Y_train)
Y_pred = logreg.predict(X_test)
# Accuracy on each split as a percentage rounded to two decimals
# (training split first, then test split).
acc_log, acc_test_log = (
    round(logreg.score(features, labels) * 100, 2)
    for features, labels in ((X_train, Y_train), (X_test, Y_test))
)
print("Accuracy on Training Set:", acc_log)
print("Accuracy on Test Set:", acc_test_log)

In [None]:
# Display scores for Logistic Regression
# Scores whatever is currently bound to Y_pred; every model cell rebinds that
# name, so run this right after the logistic-regression cell.
display_scores(Y_test, Y_pred)

In [None]:
# Cross-validation: Logistic Regression
# 5-fold CV on the full X / y. The scorer reports MSE negated, so the sign is
# flipped before the square root to obtain one RMSE value per fold.
# tree_rmse_scores is a shared name that every cross-validation cell rebinds.
scores = cross_val_score(estimator=logreg, X=X, y=y, cv=5, scoring="neg_mean_squared_error")
tree_rmse_scores = np.sqrt(np.negative(scores))
display_cross_val_scores(tree_rmse_scores)

In [None]:
# Logistic-regression coefficient per feature
# Feature names are train_rg's columns with the first one (the Lethality
# target) removed; the values are the first row of logreg.coef_.
# NOTE(review): the "Correlation" column holds model coefficients, not
# correlation coefficients - consider renaming the header.
coeff_df = pd.DataFrame({
    "Feature": train_rg.columns.delete(0),
    "Correlation": logreg.coef_[0],
})
coeff_df.sort_values(by='Correlation', ascending=False)

In [None]:
# Support Vector Machine (SVC with default settings)
# fit() returns the fitted estimator, so construction and training are chained.
svc = SVC().fit(X_train, Y_train)
Y_pred = svc.predict(X_test)
# Accuracy on each split as a percentage rounded to two decimals
# (training split first, then test split).
acc_svc, acc_test_svc = (
    round(svc.score(features, labels) * 100, 2)
    for features, labels in ((X_train, Y_train), (X_test, Y_test))
)
print("Accuracy on Training Set:", acc_svc)
print("Accuracy on Test Set:", acc_test_svc)

In [None]:
# Display scores for SVC
# Scores whatever is currently bound to Y_pred; every model cell rebinds that
# name, so run this right after the SVC cell.
display_scores(Y_test, Y_pred)

In [None]:
# Cross-validation: SVC
# 5-fold CV on the full X / y. The scorer reports MSE negated, so the sign is
# flipped before the square root to obtain one RMSE value per fold.
# tree_rmse_scores is a shared name that every cross-validation cell rebinds.
scores = cross_val_score(estimator=svc, X=X, y=y, cv=5, scoring="neg_mean_squared_error")
tree_rmse_scores = np.sqrt(np.negative(scores))
display_cross_val_scores(tree_rmse_scores)

In [None]:
# K-Nearest Neighbours (k = 3)
# fit() returns the fitted estimator, so construction and training are chained.
knn = KNeighborsClassifier(n_neighbors=3).fit(X_train, Y_train)
Y_pred = knn.predict(X_test)
# Accuracy on each split as a percentage rounded to two decimals
# (training split first, then test split).
acc_knn, acc_test_knn = (
    round(knn.score(features, labels) * 100, 2)
    for features, labels in ((X_train, Y_train), (X_test, Y_test))
)
print("Accuracy on Training Set:", acc_knn)
print("Accuracy on Test Set:", acc_test_knn)

In [None]:
# Display scores for KNN
# Scores whatever is currently bound to Y_pred; every model cell rebinds that
# name, so run this right after the KNN cell.
display_scores(Y_test, Y_pred)

In [None]:
# Cross-validation: KNN
# 5-fold CV on the full X / y. The scorer reports MSE negated, so the sign is
# flipped before the square root to obtain one RMSE value per fold.
# tree_rmse_scores is a shared name that every cross-validation cell rebinds.
scores = cross_val_score(estimator=knn, X=X, y=y, cv=5, scoring="neg_mean_squared_error")
tree_rmse_scores = np.sqrt(np.negative(scores))
display_cross_val_scores(tree_rmse_scores)

In [None]:
# Gaussian Naive Bayes
# fit() returns the fitted estimator, so construction and training are chained.
gaussian = GaussianNB().fit(X_train, Y_train)
Y_pred = gaussian.predict(X_test)
# Accuracy on each split as a percentage rounded to two decimals
# (training split first, then test split).
acc_gaussian, acc_test_gaussian = (
    round(gaussian.score(features, labels) * 100, 2)
    for features, labels in ((X_train, Y_train), (X_test, Y_test))
)
print("Accuracy on Training Set:", acc_gaussian)
print("Accuracy on Test Set:", acc_test_gaussian)
display_scores(Y_test, Y_pred)

In [None]:
# Cross-validation: Gaussian Naive Bayes
# 5-fold CV on the full X / y. The scorer reports MSE negated, so the sign is
# flipped before the square root to obtain one RMSE value per fold.
# tree_rmse_scores is a shared name that every cross-validation cell rebinds.
scores = cross_val_score(estimator=gaussian, X=X, y=y, cv=5, scoring="neg_mean_squared_error")
tree_rmse_scores = np.sqrt(np.negative(scores))
display_cross_val_scores(tree_rmse_scores)

In [None]:
# Perceptron (default settings)
# fit() returns the fitted estimator, so construction and training are chained.
perceptron = Perceptron().fit(X_train, Y_train)
Y_pred = perceptron.predict(X_test)
# Accuracy on each split as a percentage rounded to two decimals
# (training split first, then test split).
acc_perceptron, acc_test_perceptron = (
    round(perceptron.score(features, labels) * 100, 2)
    for features, labels in ((X_train, Y_train), (X_test, Y_test))
)
print("Accuracy on Training Set:", acc_perceptron)
print("Accuracy on Test Set:", acc_test_perceptron)
display_scores(Y_test, Y_pred)

In [None]:
# Cross-validation: Perceptron
# 5-fold CV on the full X / y. The scorer reports MSE negated, so the sign is
# flipped before the square root to obtain one RMSE value per fold.
# tree_rmse_scores is a shared name that every cross-validation cell rebinds.
scores = cross_val_score(estimator=perceptron, X=X, y=y, cv=5, scoring="neg_mean_squared_error")
tree_rmse_scores = np.sqrt(np.negative(scores))
display_cross_val_scores(tree_rmse_scores)

In [None]:
# Linear SVC
# random_state is fixed (same seed as the train/test split) so the fitted
# model and its scores are reproducible across kernel restarts.
linear_svc = LinearSVC(random_state=42)
linear_svc.fit(X_train, Y_train)
Y_pred = linear_svc.predict(X_test)
# Accuracy on each split as a percentage rounded to two decimals.
acc_linear_svc = round(linear_svc.score(X_train, Y_train) * 100, 2)
acc_test_linear_svc = round(linear_svc.score(X_test, Y_test) * 100, 2)

print("Accuracy on Training Set:", acc_linear_svc)
print("Accuracy on Test Set:", acc_test_linear_svc)
display_scores(Y_test, Y_pred)

In [None]:
# Cross-validation: Linear SVC
# 5-fold CV on the full X / y. The scorer reports MSE negated, so the sign is
# flipped before the square root to obtain one RMSE value per fold.
# tree_rmse_scores is a shared name that every cross-validation cell rebinds.
scores = cross_val_score(estimator=linear_svc, X=X, y=y, cv=5, scoring="neg_mean_squared_error")
tree_rmse_scores = np.sqrt(np.negative(scores))
display_cross_val_scores(tree_rmse_scores)

In [None]:
# Stochastic Gradient Descent classifier
# SGD shuffles the training data, so random_state is fixed (same seed as the
# train/test split) to make the reported scores reproducible.
sgd = SGDClassifier(random_state=42)
sgd.fit(X_train, Y_train)
Y_pred = sgd.predict(X_test)
# Accuracy on each split as a percentage rounded to two decimals.
acc_sgd = round(sgd.score(X_train, Y_train) * 100, 2)
acc_test_sgd = round(sgd.score(X_test, Y_test) * 100, 2)
print("Accuracy on Training Set:", acc_sgd)
print("Accuracy on Test Set:", acc_test_sgd)
display_scores(Y_test, Y_pred)

In [None]:
# Cross-validation: SGD
# 5-fold CV on the full X / y. The scorer reports MSE negated, so the sign is
# flipped before the square root to obtain one RMSE value per fold.
# tree_rmse_scores is a shared name that every cross-validation cell rebinds.
scores = cross_val_score(estimator=sgd, X=X, y=y, cv=5, scoring="neg_mean_squared_error")
tree_rmse_scores = np.sqrt(np.negative(scores))
display_cross_val_scores(tree_rmse_scores)

In [None]:
# Decision Tree
# random_state is fixed (same seed as the train/test split) so tie-breaking
# between equally good splits does not change the tree from run to run.
decision_tree = DecisionTreeClassifier(random_state=42)
decision_tree.fit(X_train, Y_train)
Y_pred = decision_tree.predict(X_test)
# Accuracy on each split as a percentage rounded to two decimals.
acc_decision_tree = round(decision_tree.score(X_train, Y_train) * 100, 2)
acc_test_decision_tree = round(decision_tree.score(X_test, Y_test) * 100, 2)
print("Accuracy on Training Set:", acc_decision_tree)
print("Accuracy on Test Set:", acc_test_decision_tree)
display_scores(Y_test, Y_pred)

In [None]:
# Cross-validation: Decision Tree
# 5-fold CV on the full X / y. The scorer reports MSE negated, so the sign is
# flipped before the square root to obtain one RMSE value per fold.
# tree_rmse_scores is a shared name that every cross-validation cell rebinds.
scores = cross_val_score(estimator=decision_tree, X=X, y=y, cv=5, scoring="neg_mean_squared_error")
tree_rmse_scores = np.sqrt(np.negative(scores))
display_cross_val_scores(tree_rmse_scores)

In [None]:
# Random Forest (100 trees)
# random_state is fixed (same seed as the train/test split) so the bootstrap
# samples and feature choices, and therefore the scores, are reproducible.
random_forest = RandomForestClassifier(n_estimators=100, random_state=42)
random_forest.fit(X_train, Y_train)
Y_pred = random_forest.predict(X_test)
# Accuracy on each split as a percentage rounded to two decimals. The extra
# score() call whose result was discarded has been removed.
acc_random_forest = round(random_forest.score(X_train, Y_train) * 100, 2)
acc_test_random_forest = round(random_forest.score(X_test, Y_test) * 100, 2)
print("Accuracy on Training Set:", acc_random_forest)
print("Accuracy on Test Set:", acc_test_random_forest)
display_scores(Y_test, Y_pred)

In [None]:
# Side-by-side accuracy table (percentages) for every classifier trained above.
# The three lists are positionally aligned: entry i of each list belongs to
# the same model.
models = pd.DataFrame({
    'Model': ['Support Vector Machines', 'KNN', 'Logistic Regression', 
              'Random Forest', 'Naive Bayes', 'Perceptron', 
              'Stochastic Gradient Descent', 'Linear SVC', 
              'Decision Tree'],
    'ScoreTrainingSet': [acc_svc, acc_knn, acc_log, 
              acc_random_forest, acc_gaussian, acc_perceptron, 
              acc_sgd, acc_linear_svc, acc_decision_tree],
    'ScoreTestSet': [acc_test_svc, acc_test_knn, acc_test_log, 
              acc_test_random_forest, acc_test_gaussian, acc_test_perceptron, 
              acc_test_sgd, acc_test_linear_svc, acc_test_decision_tree]})

# NOTE(review): rows are ranked by training accuracy, which favours models
# that memorise the training data; consider ranking by 'ScoreTestSet'.
models.sort_values(by='ScoreTrainingSet', ascending=False)

In [None]:
def rmse(predictions, targets):
    """Return the root-mean-squared error between predictions and targets.

    This re-defines the ``rmse`` helper from the metrics cell earlier in the
    notebook, with different parameter names.

    Parameters:
        predictions: array-like of predicted values.
        targets: array-like of true values with the same number of elements.

    Returns:
        The square root of the mean squared difference, as a NumPy float.
    """
    # The expression must sit on the same line as `return`; a bare `return`
    # followed by the expression on the next line always yields None.
    # Both inputs are flattened so a single-column frame and a 1-D array
    # are compared element by element instead of being broadcast.
    predicted = np.asarray(predictions, dtype=float).ravel()
    actual = np.asarray(targets, dtype=float).ravel()
    return np.sqrt(((predicted - actual) ** 2).mean())

In [None]:
def display_scores(scores, Y_pred=None):
    """Print a score summary, or classification metrics when predictions are given.

    This definition replaces the earlier two-argument ``display_scores`` in the
    kernel namespace. Accepting an optional second argument keeps the model
    cells (which call ``display_scores(Y_test, Y_pred)``) working if they are
    re-run after this cell.

    Parameters:
        scores: with one argument, an object exposing ``.mean()`` and
            ``.std()`` (e.g. a NumPy array of cross-validation scores);
            with two arguments, the true labels.
        Y_pred: optional predicted labels. When supplied, MAE, average
            precision, recall and F1 are printed instead of the summary.
    """
    if Y_pred is not None:
        # Two-argument form: same report as the earlier definition. All
        # metrics are evaluated before anything is printed.
        mae = mean_absolute_error(scores, Y_pred)
        precision = average_precision_score(scores, Y_pred)
        recall = recall_score(scores, Y_pred)
        f1 = f1_score(scores, Y_pred)
        print("MAE:", mae)
        print("Average Precision Score:", precision)
        print("Recall:", recall)
        print("F1:", f1)
        return

    print("Scores:", scores)
    print("Mean:", scores.mean())
    print("Standard deviation:", scores.std())

# Summarise the RMSE values currently bound to tree_rmse_scores, i.e. those
# from whichever cross-validation cell was executed most recently.
display_scores(tree_rmse_scores)