# Random Forest Classification (Titanic survival)

## Importing the libraries

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

## Importing the dataset

In [None]:
# Read the training and test CSV files (paths are relative to the working directory).
train_data, test_data = (
    pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv')
)


In [None]:
# Count the missing values in each column of the training frame.
train_data.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [None]:
# Count the missing values in each column of the test frame.
test_data.isna().sum()

PassengerId      0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64

In [None]:
from sklearn.impute import SimpleImputer

# Fill missing values. Every imputer is fitted on the TRAINING data only and
# then applied to the test data, so the test set is completed with the same
# statistics the model sees during training (no statistics are learned from
# the test set).

# Numeric columns: replace NaN with the training-set mean of that column.
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
train_data['Age'] = imputer.fit_transform(train_data[['Age']]).ravel()
test_data['Age'] = imputer.transform(test_data[['Age']]).ravel()

# 'Fare' is re-fitted on the training column before filling the test column.
imputer.fit(train_data[['Fare']])
test_data['Fare'] = imputer.transform(test_data[['Fare']]).ravel()

# Categorical column: replace NaN with the most frequent training value.
imputer1 = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
train_data['Embarked'] = imputer1.fit_transform(train_data[['Embarked']]).ravel()
test_data['Embarked'] = imputer1.transform(test_data[['Embarked']]).ravel()

In [None]:
# The 'Survived' column of the training frame is the prediction target.
y_train = train_data.loc[:, 'Survived']
print(y_train)

0      0
1      1
2      1
3      1
4      0
      ..
886    0
887    1
888    0
889    1
890    0
Name: Survived, Length: 891, dtype: int64


In [None]:
# Columns used as model inputs; the string-valued ones are one-hot encoded below.
features = ["Pclass", "Age", "SibSp", "Parch", "Fare", "Sex", "Embarked"]

X_train = pd.get_dummies(train_data[features])

# Encoding the two frames independently can yield different dummy columns when
# a category level is absent from one of them. Align the test matrix to the
# training columns (same set, same order); a dummy column missing from the
# test data is filled with 0, and any column unseen in training is dropped.
X_test = pd.get_dummies(test_data[features]).reindex(
    columns=X_train.columns, fill_value=0
)

In [None]:
# Print the encoded test feature matrix to inspect its columns and shape.
print(X_test)

     Pclass       Age  SibSp  ...  Embarked_C  Embarked_Q  Embarked_S
0         3  34.50000      0  ...           0           1           0
1         3  47.00000      1  ...           0           0           1
2         2  62.00000      0  ...           0           1           0
3         3  27.00000      0  ...           0           0           1
4         3  22.00000      1  ...           0           0           1
..      ...       ...    ...  ...         ...         ...         ...
413       3  30.27259      0  ...           0           0           1
414       1  39.00000      0  ...           1           0           0
415       3  38.50000      0  ...           0           0           1
416       3  30.27259      0  ...           0           0           1
417       3  30.27259      1  ...           1           0           0

[418 rows x 10 columns]


In [None]:
from sklearn.ensemble import RandomForestClassifier

# Hyper-parameters are collected in one place; random_state is fixed so that
# re-running the cell rebuilds the same forest.
forest_params = {
    "n_estimators": 100,
    "criterion": 'entropy',
    "random_state": 0,
}
classifier = RandomForestClassifier(**forest_params)

# Fit on the encoded training features; the fitted estimator is the cell output.
classifier.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='entropy', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=0, verbose=0,
                       warm_start=False)

In [None]:

# Predict a label for every row of the test feature matrix with the fitted
# classifier, then print the resulting array.
y_pred = classifier.predict(X_test)
print(y_pred)



[0 0 0 1 0 0 0 0 1 0 0 0 1 0 1 1 0 1 0 1 1 1 1 1 1 0 1 1 1 0 0 0 1 0 1 1 0
 0 0 1 0 1 0 1 1 0 0 0 1 1 0 0 1 1 0 0 0 0 0 1 0 0 0 1 1 1 1 0 0 1 1 0 0 0
 1 0 0 1 0 1 1 0 0 0 0 0 1 0 0 1 0 0 1 0 0 0 1 0 0 0 1 0 0 0 1 0 0 0 0 0 0
 0 1 0 1 0 0 1 0 1 1 0 1 0 0 1 0 1 0 0 1 0 0 0 0 0 0 0 0 0 0 1 0 0 1 0 0 0
 1 0 1 0 0 1 0 0 1 0 1 1 0 1 1 0 0 0 0 0 1 0 0 0 0 0 0 1 1 1 1 1 0 0 1 0 1
 0 1 0 0 0 0 0 0 0 1 0 1 0 0 0 0 1 0 1 0 0 0 0 1 0 0 0 0 0 1 0 0 0 1 0 1 0
 1 1 1 0 0 0 0 0 0 1 0 0 1 0 1 1 1 1 1 1 0 0 0 0 1 0 1 0 1 0 0 0 0 0 0 0 1
 0 0 0 1 1 0 0 0 0 0 0 0 0 1 1 0 1 0 0 0 0 0 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0
 1 0 0 0 0 0 0 0 0 1 1 1 0 0 0 1 0 0 1 1 0 0 0 0 0 0 0 1 1 0 1 0 0 0 1 1 0
 0 0 0 1 0 0 0 0 0 0 1 0 1 0 1 0 1 1 0 0 0 1 0 1 0 0 1 0 1 1 1 1 0 0 0 1 1
 0 1 0 0 1 1 0 0 0 0 0 0 1 0 0 1 0 0 0 0 0 1 0 0 0 1 0 1 0 0 1 0 1 0 0 1 0
 0 0 1 0 1 0 0 1 0 0 1]


In [None]:
# Pair each test passenger id with its predicted label, write the table to a
# CSV file without the index column, and print it for a quick look.
submission = pd.DataFrame(
    {
        "PassengerId": test_data["PassengerId"],
        "Survived": y_pred,
    }
)
submission.to_csv('titanic1.csv', index=False)
print(submission)
print("Submitted Successfully")

     PassengerId  Survived
0            892         0
1            893         0
2            894         0
3            895         1
4            896         0
..           ...       ...
413         1305         0
414         1306         1
415         1307         0
416         1308         0
417         1309         1

[418 rows x 2 columns]
Submitted Successfully
