<a href="https://colab.research.google.com/github/Vishwa1030/CODSOFT/blob/main/task1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Importing Libraries**

In [None]:
import warnings

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.exceptions import DataConversionWarning
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split

### **Suppress warnings**

In [None]:
# Ignore every UserWarning and every sklearn DataConversionWarning from here on.
for noisy_category in (UserWarning, DataConversionWarning):
    warnings.filterwarnings("ignore", category=noisy_category)

# **Loading the Dataset**

In [None]:
# Read the passenger table from the Colab working directory and dump it as plain text.
titan = pd.read_csv(filepath_or_buffer='/content/task1.csv')
print(titan)


     PassengerId  Survived  Pclass  \
0            892         0       3   
1            893         1       3   
2            894         0       2   
3            895         0       3   
4            896         1       3   
..           ...       ...     ...   
413         1305         0       3   
414         1306         1       1   
415         1307         0       3   
416         1308         0       3   
417         1309         0       3   

                                             Name     Sex   Age  SibSp  Parch  \
0                                Kelly, Mr. James    male  34.5      0      0   
1                Wilkes, Mrs. James (Ellen Needs)  female  47.0      1      0   
2                       Myles, Mr. Thomas Francis    male  62.0      0      0   
3                                Wirz, Mr. Albert    male  27.0      0      0   
4    Hirvonen, Mrs. Alexander (Helga E Lindqvist)  female  22.0      1      1   
..                                            ...     ...

In [None]:
# Column names, non-null counts and dtypes: shows which columns have missing values.
titan.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  418 non-null    int64  
 1   Survived     418 non-null    int64  
 2   Pclass       418 non-null    int64  
 3   Name         418 non-null    object 
 4   Sex          418 non-null    object 
 5   Age          332 non-null    float64
 6   SibSp        418 non-null    int64  
 7   Parch        418 non-null    int64  
 8   Ticket       418 non-null    object 
 9   Fare         417 non-null    float64
 10  Cabin        91 non-null     object 
 11  Embarked     418 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 39.3+ KB


In [None]:
# Preview the first rows of the raw table.
titan.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,0,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,1,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,0,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,0,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,1,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [None]:
# Preview the last rows of the raw table.
titan.tail()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
413,1305,0,3,"Spector, Mr. Woolf",male,,0,0,A.5. 3236,8.05,,S
414,1306,1,1,"Oliva y Ocana, Dona. Fermina",female,39.0,0,0,PC 17758,108.9,C105,C
415,1307,0,3,"Saether, Mr. Simon Sivertsen",male,38.5,0,0,SOTON/O.Q. 3101262,7.25,,S
416,1308,0,3,"Ware, Mr. Frederick",male,,0,0,359309,8.05,,S
417,1309,0,3,"Peter, Master. Michael J",male,,1,1,2668,22.3583,,C


In [None]:
# Summary statistics (count, mean, std, quartiles, min/max) for the numeric columns.
titan.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,418.0,418.0,418.0,332.0,418.0,418.0,417.0
mean,1100.5,0.363636,2.26555,30.27259,0.447368,0.392344,35.627188
std,120.810458,0.481622,0.841838,14.181209,0.89676,0.981429,55.907576
min,892.0,0.0,1.0,0.17,0.0,0.0,0.0
25%,996.25,0.0,1.0,21.0,0.0,0.0,7.8958
50%,1100.5,0.0,3.0,27.0,0.0,0.0,14.4542
75%,1204.75,1.0,3.0,39.0,1.0,0.0,31.5
max,1309.0,1.0,3.0,76.0,8.0,9.0,512.3292


# **Data Preprocessing**


 **Define features and target variable**











In [None]:
# Predictor columns (raw names, before any encoding) and the label column.
custom = [
    'Pclass',
    'Sex',
    'Age',
    'SibSp',
    'Parch',
    'Fare',
    'Embarked',
]
target = 'Survived'

 **Handle missing values for numeric columns**

In [None]:
# Fill gaps in the numeric columns with each column's mean.
# The imputer is fitted on the full table (before the train/test split).
numeric_custom = ['Age', 'Fare']
numeric_imputer = SimpleImputer(strategy='mean').fit(titan[numeric_custom])
titan[numeric_custom] = numeric_imputer.transform(titan[numeric_custom])

**Handle missing values for categorical columns**

In [None]:
# Fill gaps in the categorical columns with each column's most frequent value.
categorical_custom = ['Sex', 'Embarked']
categorical_imputer = SimpleImputer(strategy='most_frequent').fit(titan[categorical_custom])
titan[categorical_custom] = categorical_imputer.transform(titan[categorical_custom])

**Convert categorical variables to numerical**

In [None]:
# Encode Sex as 0 (female) / 1 (male), then expand Embarked into one
# indicator column per value (named 'Embarked_<value>').
titan = (
    titan
    .assign(Sex=titan['Sex'].map({'female': 0, 'male': 1}))
    .pipe(pd.get_dummies, columns=['Embarked'])
)

**Update features after one-hot encoding: drop the original 'Embarked' entry from the feature list**

In [None]:
# Drop the raw 'Embarked' name from the feature list; get_dummies replaced that
# column with indicator columns. Filtering (instead of list.remove) keeps the cell
# safe to re-run: a second run is a no-op rather than a ValueError.
custom = [col for col in custom if col != 'Embarked']

**Add the one-hot encoded 'Embarked_*' columns to the feature list**

In [None]:
# Append the indicator columns created by get_dummies to the feature list.
# Names already present are skipped, so re-running the cell does not add
# duplicate feature columns.
custom += [
    col for col in titan.columns
    if col.startswith('Embarked_') and col not in custom
]

### **Data Splitting**

**Split the data**

In [None]:
# P holds the feature columns, S the label column.
P, S = titan[custom], titan[target]

# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
P_train, P_test, S_train, S_test = train_test_split(
    P,
    S,
    test_size=0.2,
    random_state=42,
)


### **Model Building and Training**

**Build the model**

In [None]:
# Random forest with 100 trees and a fixed seed, trained on the training split.
model = RandomForestClassifier(random_state=42, n_estimators=100)
model.fit(X=P_train, y=S_train)


### **Model Evaluation and Prediction**

**Make predictions**

In [None]:
# Predicted labels for the held-out feature rows.
S_pred = model.predict(P_test)

In [None]:
# NOTE(review): `S_test_reshaped` is not assigned in any visible cell, so this line
# raises NameError on a fresh kernel. The name suggests it was derived from the
# target `S_test`, which would not be a valid feature input for `model.predict`
# (the model was fitted on the columns in `custom`) — TODO confirm the intended input.
P_pred = model.predict(S_test_reshaped)

In [None]:
# Per-row flag: 1 if the row's values in `P_test_binary` sum to more than 0, else 0.
# NOTE(review): `P_test_binary` is not assigned in any visible cell, so this line
# raises NameError on a fresh kernel — TODO restore or remove.
P_test_binary_binary = (P_test_binary.sum(axis=1) > 0).astype(int)

In [None]:
# Predict on the held-out features from the split. The original referenced `X_test`,
# which is not defined in any visible cell; `P_test` is the test feature frame and
# its 20% share of 418 rows matches the 84 predictions reported further down.
y_pred = model.predict(P_test)

In [None]:
# True labels of the held-out rows as a 1-D NumPy array. The original read `y_test`,
# which is not defined in any visible cell; `S_test` is the test target from the
# split. `to_numpy()` is used because `Series.ravel()` is deprecated in recent pandas.
y_test_flat = S_test.to_numpy()

In [None]:
# Flatten the predictions to one dimension so they line up with `y_test_flat`.
y_pred_flat = y_pred.ravel()

**Evaluate the model**

In [None]:
# Fraction of held-out rows whose predicted label equals the true label.
# NOTE(review): the stored output is a perfect 1.00. In the rows displayed by
# head()/tail() above, Survived is 1 exactly for the female rows, so the label may be
# fully determined by `Sex` in this CSV — TODO confirm before trusting the score.
accuracy = accuracy_score(S_test, S_pred)
print(f'Accuracy: {accuracy:.2f}')

Accuracy: 1.00


In [None]:
# NOTE(review): `P_pred_binary` is not assigned in any visible cell, and
# `P_test_binary_binary` depends on the equally undefined `P_test_binary`, so this
# cell cannot run on a fresh kernel. It also overwrites the `accuracy` value computed
# from S_test / S_pred in the previous cell — TODO restore the missing cells or remove.
accuracy = accuracy_score(P_test_binary_binary, P_pred_binary)
print(f'Accuracy: {accuracy:.2f}')


Accuracy: 0.40


In [None]:
# Show the predictions stored in `P_pred`.
# NOTE(review): `P_pred` is built from an input that is not defined in any visible cell.
print("Predictions:", P_pred)

Predictions: [0]


In [None]:
# Show the predicted labels for the held-out rows.
print("Predictions:", S_pred)

Predictions: [0 1 0 0 1 0 1 0 1 0 0 1 1 1 0 0 1 0 0 0 0 0 1 0 1 1 1 0 0 0 0 1 1 0 1 0 1
 0 1 0 1 1 0 0 0 0 1 1 1 0 0 1 1 0 0 1 0 0 1 0 0 0 1 0 1 0 0 0 0 0 0 0 0 1
 0 1 0 1 1 0 0 1 0 1]


In [None]:
# Print the distinct values found in each column of `P_test_binary`.
# NOTE(review): `P_test_binary` is not assigned in any visible cell, so this loop
# raises NameError on a fresh kernel. The loop variables `column` and `unique_values`
# remain bound after the loop and are read again by the following cell.
for column in P_test_binary.columns:
    unique_values = P_test_binary[column].unique()
    print(f'Unique values in {column}: {unique_values}')

Unique values in Pclass: [1]
Unique values in Sex: [1 0]
Unique values in Age: [1 0]
Unique values in SibSp: [0 1]
Unique values in Parch: [0 1]
Unique values in Fare: [1]
Unique values in Embarked_C: [1 0]
Unique values in Embarked_Q: [0 1]
Unique values in Embarked_S: [0 1]


In [None]:
# NOTE(review): this reads `column` and `unique_values` left over from the loop in the
# previous cell, so it only repeats that loop's last line — likely a debugging leftover.
print(f'Unique values in {column}: {unique_values}')

Unique values in Embarked_S: [0 1]


In [None]:
# Sanity check: list the distinct label values present in the true and predicted arrays.
print("Unique values in y_test:", np.unique(y_test_flat))
print("Unique values in y_pred:", np.unique(y_pred_flat))

Unique values in y_test: [0 1]
Unique values in y_pred: [0 1]


In [None]:
# Sanity check: both arrays must have the same shape before they are compared.
print("Shape of y_test:", y_test_flat.shape)
print("Shape of y_pred:", y_pred_flat.shape)

Shape of y_test: (84,)
Shape of y_pred: (84,)


**Print classification report**

In [None]:
# Flatten both arrays, then print a classification report if their lengths agree.
# NOTE(review): `P_test_binary_flat` and `P_pred_binary_flat` are read here before any
# visible cell assigns them, so this cell raises NameError on a fresh kernel.
# Assumes both are NumPy arrays (they must support `.ravel()`) — TODO confirm their origin.
P_test_binary_flat = P_test_binary_flat.ravel()
P_pred_binary_flat = P_pred_binary_flat.ravel()

# A length mismatch is only reported with a message; no exception is raised.
if len(P_test_binary_flat) != len(P_pred_binary_flat):
    print("Error: Lengths of P_test_binary_flat and P_pred_binary_flat are inconsistent.")
else:
    # Per-class precision / recall / F1 for the two arrays.
    print('\nClassification Report:')
    print(classification_report(P_test_binary_flat, P_pred_binary_flat))

Error: Lengths of P_test_binary_flat and P_pred_binary_flat are inconsistent.


In [None]:
# Flatten the held-out labels and their predictions to 1-D arrays.
# `S_test` is a pandas Series taken from `titan[target]`; `Series.ravel()` is
# deprecated in recent pandas, and `to_numpy()` already returns a 1-D array.
S_test_flat = S_test.to_numpy()
S_pred_flat = S_pred.ravel()  # S_pred is the output of model.predict

# Guard against comparing arrays of different lengths; a mismatch is reported, not raised.
# NOTE(review): the stored output shows the mismatch message, which should not occur
# when S_pred was produced from the same split as S_test — likely stale kernel state.
if len(S_test_flat) != len(S_pred_flat):
    print("Error: Lengths of S_test and S_pred are inconsistent.")
else:
    # Per-class precision / recall / F1 for the held-out rows.
    print('\nClassification Report for S_test and S_pred:')
    print(classification_report(S_test_flat, S_pred_flat))

Error: Lengths of S_test and S_pred are inconsistent.


In [None]:
# Per-class precision / recall / F1 for the flattened true and predicted labels.
print('\nClassification Report:')
print(classification_report(y_test_flat, y_pred_flat))


Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        50
           1       1.00      1.00      1.00        34

    accuracy                           1.00        84
   macro avg       1.00      1.00      1.00        84
weighted avg       1.00      1.00      1.00        84

