In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [32]:
import pandas as pd
import numpy as np
import os
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [33]:
data_dir = '/content/drive/MyDrive/Codealpha task1/data'

In [34]:
files = os.listdir(data_dir)
print("Files in '/content/drive/MyDrive/Codealpha task1/data' directory:", files)

Files in '/content/drive/MyDrive/Codealpha task1/data' directory: ['data_test.csv', 'data_train.csv']


In [35]:
train_path = os.path.join(data_dir, 'data_train.csv')
test_path = os.path.join(data_dir, 'data_test.csv')

In [36]:
try:
    data_train = pd.read_csv(train_path)
    data_test = pd.read_csv(test_path)
    print("Training Data:")
    print(data_train.head())
    print("\nTest Data:")
    print(data_test.head())
except FileNotFoundError as e:
    print(f"File not found: {e}")
except pd.errors.EmptyDataError as e:
    print(f"Empty data file: {e}")
except pd.errors.ParserError as e:
    print(f"Error parsing file: {e}")
except Exception as e:
    print(f"An unexpected error occurred: {e}")

Training Data:
   label  Age  Language  Sex  Marital  Has_Credit  Field  Month_of_birth  \
0      0   34         1    2        6           2     13              12   
1      0   38         1    1        5           1     10               7   
2      0   35         1    2        4           2      9               8   
3      0   27         1    1        5           2     13               7   
4      0   32         1    2        4           2     10               7   

   Day_of_birth  Region  Number_of_credits  Linked_cards  INPS_mln_sum  \
0             1      12                  1             0           0.0   
1             1      13                  1             2           0.0   
2             1      13                  4             1           1.0   
3             1      12                  1             2           1.0   
4             1      13                  3             1           2.0   

   INPS_yes_no  Score_level  Score_class Score_point  Changed_phone_number  
0     

In [22]:
print(data_train.isnull().sum())
print(data_test.isnull().sum())

label                   0
Age                     0
Language                0
Sex                     0
Marital                 0
Has_Credit              0
Field                   0
Month_of_birth          0
Day_of_birth            0
Region                  0
Number_of_credits       0
Linked_cards            0
INPS_mln_sum            0
INPS_yes_no             0
Score_level             0
Score_class             0
Score_point             0
Changed_phone_number    0
dtype: int64
label                   0
Age                     0
Language                0
Sex                     0
Marital                 0
Has_Credit              0
Field                   0
Month_of_birth          0
Day_of_birth            0
Region                  0
Number_of_credits       0
Linked_cards            0
INPS_mln_sum            0
INPS_yes_no             0
Score_level             0
Score_class             0
Score_point             0
Changed_phone_number    0
dtype: int64


In [37]:
data_train = data_train.dropna()
data_test = data_test.dropna()

In [38]:
X_train = data_train.drop(columns=['Sex'])
y_train = data_train['Sex']

X_test = data_test.drop(columns=['Sex'])
y_test = data_test['Sex']


In [40]:
# Encode categorical features (excluding the target variable)
label_encoders = {}
for column in X_train.select_dtypes(include=['object']).columns:
    le = LabelEncoder()
    X_train[column] = le.fit_transform(X_train[column])
    X_test[column] = le.transform(X_test[column])
    label_encoders[column] = le

In [41]:

X_train = X_train.replace('-', np.nan)
X_test = X_test.replace('-', np.nan)


In [42]:
# Fill NaN values with 0 or another strategy (e.g., mean of the column)
X_train = X_train.fillna(0)
X_test = X_test.fillna(0)

In [43]:
X_train = X_train.apply(pd.to_numeric)
X_test = X_test.apply(pd.to_numeric)

In [44]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [45]:
# Train and evaluate RandomForestClassifier
model_rf = RandomForestClassifier(n_estimators=100, random_state=42)
model_rf.fit(X_train, y_train)

In [46]:
y_pred_rf = model_rf.predict(X_test)
print("RandomForestClassifier Accuracy: ", accuracy_score(y_test, y_pred_rf))
print("Classification Report:\n", classification_report(y_test, y_pred_rf))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_rf))

RandomForestClassifier Accuracy:  1.0
Classification Report:
               precision    recall  f1-score   support

           1       1.00      1.00      1.00        16
           2       1.00      1.00      1.00        32

    accuracy                           1.00        48
   macro avg       1.00      1.00      1.00        48
weighted avg       1.00      1.00      1.00        48

Confusion Matrix:
 [[16  0]
 [ 0 32]]


In [47]:
# Train and evaluate LogisticRegression
model_lr = LogisticRegression(max_iter=1000)
model_lr.fit(X_train, y_train)

In [48]:
y_pred_lr = model_lr.predict(X_test)
print("LogisticRegression Accuracy: ", accuracy_score(y_test, y_pred_lr))
print("Classification Report:\n", classification_report(y_test, y_pred_lr))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_lr))

LogisticRegression Accuracy:  0.7083333333333334
Classification Report:
               precision    recall  f1-score   support

           1       0.60      0.38      0.46        16
           2       0.74      0.88      0.80        32

    accuracy                           0.71        48
   macro avg       0.67      0.62      0.63        48
weighted avg       0.69      0.71      0.69        48

Confusion Matrix:
 [[ 6 10]
 [ 4 28]]
