In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/titanic/train.csv
/kaggle/input/titanic/test.csv
/kaggle/input/titanic/gender_submission.csv


In [2]:
# Adding imports for different classifier Algorithms
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.naive_bayes import MultinomialNB

from sklearn.metrics import accuracy_score,precision_score,recall_score,f1_score
from sklearn.metrics import confusion_matrix,classification_report

**Parameters of Classification Algorithms**
n_neighbors: just for KNN - defines the number of data points a new point will be compared with. the most frequent target out of those neighbors will be the new points classification

C, alpha: adjust how restricted the model is. Restriction helps prevent overfitting

max_depth: used for decision tree algorithms. A method of pre pruning that helps prevent overfitting. says that the tree cannot exceed n levels

n_estimators: used for gradient boosting. the number of simple trees that are generated

learning_rate: used for gradient boosting. the strength that new trees learn from the mistakes of old trees. Higher number means more learning. can lead to overfitting if too high.

In [3]:
# defining the function that calls all classification algorithms
def classification_(x,y):
    
    k=KNeighborsClassifier(n_neighbors = 3)
    svc=SVC(C=10)
    d=DecisionTreeClassifier(max_depth=6)
    log=LogisticRegression()
    gbc=GradientBoostingClassifier(n_estimators=1200, learning_rate = 0.1)
    mn=MultinomialNB(alpha=.5)
    rf=RandomForestClassifier(max_depth=4)
    ab=AdaBoostClassifier(n_estimators = 1200)
    
    algos=[k,svc,d,log,gbc,mn,rf,ab]
    algos_name=['KNeigbors','SVC','DecisionTree','LogisticRegr','GradientBoosting','Multinominal','RandomForest','AdaBoost']
    
    accuracy = []
    precision = []
    recall = []
    f1 = []
   
    result=pd.DataFrame(columns=['AccuracyScore','PrecisionScore','RecallScore','f1_Score'],index=algos_name)

    x_train,x_test,y_train,y_test=train_test_split(x,y,random_state=42)

    for i in algos:
        
        predict=i.fit(x_train,y_train).predict(x_test)
        
        accuracy.append(accuracy_score(y_test,predict))
        precision.append(precision_score(y_test,predict,average='weighted'))
        recall.append(recall_score(y_test,predict,average='weighted'))
        f1.append(f1_score(y_test,predict,average='weighted'))
      
    result.AccuracyScore=accuracy
    result.PrecisionScore=precision
    result.RecallScore=recall
    result.f1_Score=f1
    
    
    return result.sort_values('f1_Score',ascending=False)

**Strengths and Weaknesses**
SVC: works well regardless of the number of features. Require careful preprocessing of data and parameter tuning. Doesn't scale well when data sample gets very large.

KNN: Doesn't scale well but can be very good for classification

DecisionTree, GradientBoosting, RandomForest, AdaBoost: all tree models scale well with large sample sizes. No preprocessing of data is necessary. Might not work well if there are many features. 

MultinomialNB: Simple. Only one parameter - alpha. Very fast in training and prediction. Good place to start with a new data set.

Logistic Regression: Very fast to train and predict. Work well with large datasets and sparse data. Work well when the data has a large number of features.

In [4]:
train = pd.read_csv('/kaggle/input/titanic/train.csv')
test = pd.read_csv('/kaggle/input/titanic/test.csv')
test_ids = test["PassengerId"]

In [5]:
train.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [6]:
test.isnull().sum()

PassengerId      0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64

In [7]:
def clean(data):
    data = data.drop(["Ticket", "PassengerId", "Name", "Cabin"], axis=1)
    cols = ["SibSp", "Parch", "Fare", "Age"]
    
    for col in cols:
        data[col].fillna(data[col].median(), inplace=True)

    data.Embarked.fillna("U", inplace=True)
    return data

train = clean(train)
test = clean(test)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data[col].fillna(data[col].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data[col].fillna(data[col].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are settin

In [8]:
# HW 2
famSizes = []
for row in range(train.shape[0]):
    famTotal = train.iloc[row]["SibSp"] + train.iloc[row]["Parch"] + 1
    famSizes.append(famTotal)
train.insert(8, "FamilySize", famSizes)

In [9]:
# HW 2 
famSizes = []
for row in range(test.shape[0]):
    famTotal = test.iloc[row]["SibSp"] + test.iloc[row]["Parch"] + 1
    famSizes.append(famTotal)
test.insert(7, "FamilySize", famSizes)

In [10]:
train.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,FamilySize
0,0,3,male,22.0,1,0,7.25,S,2
1,1,1,female,38.0,1,0,71.2833,C,2
2,1,3,female,26.0,0,0,7.925,S,1
3,1,1,female,35.0,1,0,53.1,S,2
4,0,3,male,35.0,0,0,8.05,S,1


In [11]:
# change strings to numbers bc the models don't like strings
from sklearn import preprocessing
le = preprocessing.LabelEncoder()
columns = ["Sex", "Embarked"]
for col in columns:
    train[col] = le.fit_transform(train[col])
    test[col] = le.transform(test[col])
    print(le.classes_)  
train.head()

['female' 'male']
['C' 'Q' 'S' 'U']


Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,FamilySize
0,0,3,1,22.0,1,0,7.25,2,2
1,1,1,0,38.0,1,0,71.2833,0,2
2,1,3,0,26.0,0,0,7.925,2,1
3,1,1,0,35.0,1,0,53.1,2,2
4,0,3,1,35.0,0,0,8.05,2,1


In [12]:
# preprocessing using zero mean and unit variance scaling
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaler.fit(train)
#scaler.fit(test)
X_scaled = scaler.transform(train)
X_test_scaled = scaler.transform(train)

from sklearn.decomposition import PCA
# keep the first two principal components of the data
pca = PCA(n_components=2)
# fit PCA model to beast cancer data
pca.fit(X_scaled)
pca.fit(X_test_scaled)

# transform data onto the first two principal components
X_train_pca = pca.transform(X_scaled)
x_test_pca = pca.transform(X_test_scaled)
print("Original shape: {}".format(str(X_scaled.shape)))
print("Reduced shape: {}".format(str(X_train_pca.shape)))

Original shape: (891, 9)
Reduced shape: (891, 2)


In [13]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

y = train["Survived"]
X = train.drop("Survived", axis=1)
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

Change the 3 lines below, comment them, do not erase them!
change them so that KNN is applied instead of Logistic Regression and re-submit and see if it did better!

In [14]:
# calling the classification function defined above to see how each algorithm is performing

classification_(X_train,y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Unnamed: 0,AccuracyScore,PrecisionScore,RecallScore,f1_Score
GradientBoosting,0.820225,0.821502,0.820225,0.820757
LogisticRegr,0.814607,0.813232,0.814607,0.813676
RandomForest,0.814607,0.812721,0.814607,0.812981
DecisionTree,0.803371,0.801032,0.803371,0.799975
AdaBoost,0.792135,0.791524,0.792135,0.791803
SVC,0.685393,0.683665,0.685393,0.644618
KNeigbors,0.646067,0.640876,0.646067,0.642964
Multinominal,0.651685,0.634245,0.651685,0.632429


In [15]:
clf = RandomForestClassifier(max_depth=4).fit(X_train, y_train)
# Since GradientBoostingClassifier did best according to the table above, we are trying that method and our score got higher
#clf = DecisionTreeClassifier(max_depth=6).fit(X_train, y_train)

In [16]:
predictions = clf.predict(X_val)
from sklearn.metrics import accuracy_score
accuracy_score(y_val, predictions)

0.8100558659217877

In [17]:
submission_preds = clf.predict(test)

In [18]:
df = pd.DataFrame({"PassengerId": test_ids.values,
                   "Survived": submission_preds,
                  })

In [19]:
df.to_csv("submission.csv", index=False)