In [32]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import RobustScaler
import pickle


In [34]:
# Load the dataset
df = pd.read_csv('titanic dataset.csv')

# Display the first few rows of the dataset
print(df.head())

# Check for missing values
print(df.isnull().sum())


   PassengerId  Survived  Pclass  \
0            1         0       3   
1            2         1       1   
2            3         1       3   
3            4         1       1   
4            5         0       3   

                                                Name     Sex   Age  SibSp  \
0                            Braund, Mr. Owen Harris    male  22.0      1   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
2                             Heikkinen, Miss. Laina  female  26.0      0   
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   
4                           Allen, Mr. William Henry    male  35.0      0   

   Parch            Ticket     Fare Cabin Embarked  
0      0         A/5 21171   7.2500   NaN        S  
1      0          PC 17599  71.2833   C85        C  
2      0  STON/O2. 3101282   7.9250   NaN        S  
3      0            113803  53.1000  C123        S  
4      0            373450   8.0500   NaN        S  
Pa

In [36]:
import pandas as pd
#filling missing values

# Create a dictionary with the fill values for each column
fill_values = {
    'Age': df['Age'].median(),
    'Fare': df['Fare'].median(),
    'Cabin': 'Unknown',
    'Embarked': df['Embarked'].mode()[0]
}

# Apply fillna to the entire DataFrame using the dictionary
df.fillna(value=fill_values, inplace=True)

# Print the DataFrame after filling missing values
print(df)


      PassengerId  Survived  Pclass  \
0               1         0       3   
1               2         1       1   
2               3         1       3   
3               4         1       1   
4               5         0       3   
...           ...       ...     ...   
1304         1305         0       3   
1305         1306         1       1   
1306         1307         0       3   
1307         1308         0       3   
1308         1309         0       3   

                                                   Name     Sex   Age  SibSp  \
0                               Braund, Mr. Owen Harris    male  22.0      1   
1     Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
2                                Heikkinen, Miss. Laina  female  26.0      0   
3          Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   
4                              Allen, Mr. William Henry    male  35.0      0   
...                                                

In [38]:
print(df.isnull().sum())

PassengerId    0
Survived       0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Cabin          0
Embarked       0
dtype: int64


In [40]:


# Drop columns that are not useful for prediction
df.drop(['Name', 'Ticket', 'Cabin', 'Embarked'], axis=1, inplace=True)

# Convert categorical variables to numerical
df['Sex'] = df['Sex'].map({'female': 0, 'male': 1}).astype(int)

# Drop rows with missing values (if any)
df.dropna(inplace=True)


In [42]:
# Define features (X) and target (y)
X = df.drop('Survived', axis=1)
y = df['Survived']

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [28]:
print(X_test)

      PassengerId  Pclass                                    Name     Sex  \
1148         1149       3                   Niklasson, Mr. Samuel    male   
1049         1050       1                Borebank, Mr. John James    male   
982           983       3                      Pedersen, Mr. Olaf    male   
808           809       2                       Meyer, Mr. August    male   
1195         1196       3       McCarthy, Miss. Catherine Katie""  female   
...           ...     ...                                     ...     ...   
572           573       1        Flynn, Mr. John Irwin ("Irving")    male   
140           141       3           Boulos, Mrs. Joseph (Sultana)  female   
1182         1183       3  Daly, Miss. Margaret Marcella Maggie""  female   
312           313       2   Lahtinen, Mrs. William (Anna Sylfven)  female   
199           200       2  Yrois, Miss. Henriette ("Mrs Harbeck")  female   

       Age  SibSp  Parch    Ticket     Fare    Cabin Embarked  
1148  28.0 

In [48]:
# Scale the features
scaler = RobustScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [50]:
# Initialize the model
model = RandomForestClassifier(n_estimators=100, random_state=42)

# Train the model
model.fit(X_train, y_train)
with open('titanic.pkl', 'wb') as file:
    pickle.dump(model, file)

In [52]:
# Make predictions on the test set
y_pred = model.predict(X_test)
print(X_test)
# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy}')

# Print classification report
print(classification_report(y_test, y_pred))


[[ 0.76323988  0.          0.         ...  0.          0.
  -0.26566554]
 [ 0.60903427 -1.          0.         ...  0.          0.
   0.50177029]
 [ 0.5046729   0.          0.         ...  0.          0.
  -0.27707337]
 ...
 [ 0.81619938  0.         -1.         ...  0.          0.
  -0.31129686]
 [-0.53894081 -0.5        -1.         ...  1.          1.
   0.47895463]
 [-0.71495327 -0.5        -1.         ...  0.          0.
  -0.0603246 ]]
Accuracy: 0.8702290076335878
              precision    recall  f1-score   support

           0       0.88      0.91      0.89       159
           1       0.85      0.82      0.83       103

    accuracy                           0.87       262
   macro avg       0.87      0.86      0.86       262
weighted avg       0.87      0.87      0.87       262



In [54]:
# Example of making a prediction for a new passenger
new_passenger = pd.DataFrame({
    'Pclass': [1],
    'Sex': [1],
    'Age': [30],
    'SibSp': [0],
    'Parch': [0],
    'Fare': [70]
})

prediction = model.predict(new_passenger)
print(f'Survival Prediction: {prediction[0]}')




ValueError: X has 6 features, but RandomForestClassifier is expecting 7 features as input.