In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import preprocessing
from sklearn.metrics import accuracy_score
import string

In [2]:
class LR:
    
    def __init__(self,l_rate=0.001,n_iter=1000):
        self.l_rate=l_rate
        self.n_iter= n_iter
        self.bias= None
        self.weights=None
        
    def sigmoid(self,x):
        return 1/(1+np.exp(-x))
    
    def fit(self,X,Y):
        #initialize weights and bias
        n_sample,n_feature=X.shape
        self.weights= np.zeros(n_feature).reshape(-1,1)
        self.bias=0

        #gradient descent
        for _ in range(self.n_iter):
            
            l_model= np.dot(X,self.weights) + self.bias  # phi(x) = wx+b
            yhat = self.sigmoid(l_model)
            
            #error calculation and derivative
            dw= (1/n_sample)* np.dot(X.T,yhat-Y)
            db= (1/n_sample)* np.sum(yhat-Y)
            
            #update weights
            self.weights-= np.multiply(self.l_rate,dw)
            self.bias-=self.l_rate*db
        
    
    def predict(self,X):
        
        linear_model= np.dot(X,self.weights)+self.bias
        y_predicted= self.sigmoid(linear_model)
        
        y_class =[1 if i>0.5 else 0 for i in y_predicted]
        
        return y_class


In [3]:
def substrings_in_string(big_string, substrings):
    for substring in substrings:
        if str.find(big_string, substring) != -1:
            return substring
    print (big_string)
    return np.nan

In [4]:
#Feature Engineering

def title(df):
    
    title_list=['Mrs', 'Mr', 'Master', 'Miss', 'Major', 'Rev',
                    'Dr', 'Ms', 'Mlle','Col', 'Capt', 'Mme', 'Countess',
                    'Don', 'Jonkheer']
    df['Title']=df['Name'].map(lambda x: substrings_in_string(x, title_list))
    
    return df

def replace_titles(x):
    
    title=x['Title']
    if title in ['Don', 'Major', 'Capt', 'Jonkheer', 'Rev', 'Col']:
        return 'Mr'
    elif title in ['Countess', 'Mme']:
        return 'Mrs'
    elif title in ['Mlle', 'Ms']:
        return 'Miss'
    elif title =='Dr':
        if x['Sex']=='Male':
            return 'Mr'
        else:
            return 'Mrs'
    else:
        return title
    


In [5]:
def cleaned_data(data):
    
    #cleaning
    mean_value=round(np.mean(data["Age"]),2)
    data["Age"].fillna(value=mean_value, inplace=True)
    data["Cabin"].fillna(value="Unknown",inplace=True)
    data["Embarked"].fillna(value="S",inplace=True)
    data=data.set_index("PassengerId")
    
    #label encoding
    label_encoder = preprocessing.LabelEncoder()
    # Encode labels in column 'species'.
    data["Sex"]= label_encoder.fit_transform(data["Sex"])
    data["Embarked"]= label_encoder.fit_transform(data["Embarked"]) 
    
    #extracting titles
    data=title(data)
    data['Title']=data.apply(replace_titles, axis=1)
    
    #cabin 
    cabin_list = ['A', 'B', 'C', 'D', 'E', 'F', 'T', 'G', 'Unknown']
    data['Deck']=data['Cabin'].map(lambda x: substrings_in_string(x, cabin_list))
    
    #family members
    data['Family_members']= data['SibSp']+data['Parch']

    #normalising Fare 
    min_fare=data["Fare"].min()
    max_fare=data["Fare"].max()
    for i in range(len(data["Fare"])):
        data["Fare"].iloc[i]= (data["Fare"].iloc[i]-min_fare)/(max_fare-min_fare)

    #normalising age
    min_age=data["Age"].min()
    max_age=data["Age"].max()
    for i in range(len(data["Age"])):
        data["Age"].iloc[i]= ( data["Age"].iloc[i]-min_age)/(max_age-min_age)

    
    data= data.drop(["Name","SibSp","Parch","Ticket","Cabin"],axis=1)
    
    
    #encoding Deck and Title:
    data["Title"]= label_encoder.fit_transform(data["Title"])
    data["Deck"]= label_encoder.fit_transform(data["Deck"])
    
    return data

    

In [12]:
train=pd.read_csv('train.csv')
cleaned_dat=cleaned_data(train)
cleaned_dat.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_with_indexer(indexer, value)


Unnamed: 0_level_0,Survived,Pclass,Sex,Age,Fare,Embarked,Title,Deck,Family_members
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1,0,3,1,0.271174,0.014151,2,2,8,1
2,1,1,0,0.472229,0.139136,0,3,2,1
3,1,3,0,0.321438,0.015469,2,1,8,0
4,1,1,0,0.434531,0.103644,2,3,2,1
5,0,3,1,0.434531,0.015713,2,2,8,0


In [13]:
#splitting data
x_train=cleaned_dat.iloc[:,1:]
y_train=cleaned_dat['Survived'].to_numpy().reshape(-1,1)

In [15]:
#calling model
lr_all= LR(0.001,100000)
lr_all.fit(x_train,y_train)
y_predicted= lr_all.predict(x_train)

#calculating accuracy
accuracy_score(y_train, y_predicted)

0.7957351290684624

In [9]:
#testing set
test=pd.read_csv("test.csv")
x_test=cleaned_data(test)

y_predicted_test=lr_all.predict(x_test)

y_predicted_test=pd.DataFrame(y_predicted_test)
y_predicted_test.index=x_test.index

y_predicted_test.columns = ["Survived"]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_with_indexer(indexer, value)


In [10]:
y_predicted_test.to_csv('submission2.csv')

In [11]:
testset=pd.read_csv("submission2.csv")
testset.head()

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,1
2,894,0
3,895,0
4,896,1


In [None]:
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))