In [1]:
import pandas as pd
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score


# TRAIN DATA

In [2]:
# Load the data
df = pd.read_csv('SalaryData_Train.csv')

In [3]:
# Preprocessing
# Encode categorical variables
encoder = LabelEncoder()
df['workclass'] = encoder.fit_transform(df['workclass'])
df['education'] = encoder.fit_transform(df['education'])
df['maritalstatus'] = encoder.fit_transform(df['maritalstatus'])
df['occupation'] = encoder.fit_transform(df['occupation'])
df['relationship'] = encoder.fit_transform(df['relationship'])
df['race'] = encoder.fit_transform(df['race'])
df['sex'] = encoder.fit_transform(df['sex'])
df['native'] = encoder.fit_transform(df['native'])
df['Salary'] = encoder.fit_transform(df['Salary'])

In [4]:
# Split the data into training and testing sets
X = df.drop('Salary', axis=1)
y = df['Salary']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)


In [5]:
# Train the classifier
clf = GaussianNB()
clf.fit(X_train, y_train)

GaussianNB()

In [6]:

# Make predictions on the testing data
y_pred = clf.predict(X_test)

In [7]:
# Evaluate the performance
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')

In [8]:
print('Accuracy:', accuracy)
print('Precision:', precision)
print('Recall:', recall)
print('F1-score:', f1)





Accuracy: 0.7966185977125808
Precision: 0.7795395167924648
Recall: 0.7966185977125808
F1-score: 0.7673347945601158


# TEST DATA

In [9]:
# Load the test data
df_test = pd.read_csv('SalaryData_Test.csv')

In [10]:
encoder=LabelEncoder()
df_test['workclass'] = encoder.fit_transform(df_test['workclass'])
df_test['education'] = encoder.fit_transform(df_test['education'])
df_test['maritalstatus'] = encoder.fit_transform(df_test['maritalstatus'])
df_test['occupation'] = encoder.fit_transform(df_test['occupation'])
df_test['relationship'] = encoder.fit_transform(df_test['relationship'])
df_test['race'] = encoder.fit_transform(df_test['race'])
df_test['sex'] = encoder.fit_transform(df_test['sex'])
df_test['native'] = encoder.fit_transform(df_test['native'])
df_test['Salary'] = encoder.fit_transform(df_test['Salary'])

In [11]:

# Make predictions on the test data
X_test = df_test.drop('Salary', axis=1)
y_test = df_test['Salary']
y_pred = clf.predict(X_test)

In [12]:
# Evaluate the performance on the test data
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')


In [13]:
print('Accuracy:', accuracy)
print('Precision:', precision)
print('Recall:', recall)
print('F1-score:', f1)

Accuracy: 0.7951527224435591
Precision: 0.7773336826203999
Recall: 0.7951527224435591
F1-score: 0.76769556528239
