In [1]:
#Load file from drive
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Import the necessary libraries

---



In [2]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Load and process initial data

In [3]:
# Load the dataset
file_path = '/content/drive/My Drive/Thesis/data.xlsx'
data = pd.read_excel(file_path)

# Preprocess the data by dropping unnecessary columns
data = data[['name', 'gender']]

# Convert the names to strings if not already
data['name'] = data['name'].astype(str)

# Gender label encoding

In [4]:
# Encode the gender labels
data['gender_encoded'] = data['gender'].map({'M': 0, 'F': 1})

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['name'] = data['name'].astype(str)


# Model training

In [5]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(
    data['name'],
    data['gender_encoded'],
    test_size=0.2,
    random_state=42
)

# Vectorize the names
vectorizer = CountVectorizer()
X_train_vect = vectorizer.fit_transform(X_train)
X_test_vect = vectorizer.transform(X_test)

# Train the MultinomialNB classifier
classifier = MultinomialNB()
classifier.fit(X_train_vect, y_train)

# Predict on the test set
y_pred = classifier.predict(X_test_vect)

# Evaluation

In [6]:
# Calculate accuracy, precision, recall, f1
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, zero_division=0)
recall = recall_score(y_test, y_pred, zero_division=0)
f1 = f1_score(y_test, y_pred, zero_division=0)
print(f'Accuracy: {accuracy * 100:.2f}%')
print(f'Precision: {precision * 100:.2f}%')
print(f'Recall: {recall * 100:.2f}%')
print(f'F1 Score: {f1 * 100:.2f}%')

Accuracy: 63.53%
Precision: 63.53%
Recall: 100.00%
F1 Score: 77.70%
