# Task 3

# This task involves using a dataset to build a predictive model that solves a specific business problem.

# Import necessary libraries

In [3]:
import pandas as pd  
import numpy as np  
from sklearn.model_selection import train_test_split  
from sklearn.preprocessing import LabelEncoder  
from sklearn.metrics import accuracy_score, precision_score, recall_score, mean_squared_error  
from sklearn.linear_model import LogisticRegression  
from scipy import stats  
from statsmodels.formula.api import ols  

In [5]:
# Load the dataset and preview its first ten rows.
# NOTE(review): the CSV location is an absolute, machine-specific Windows path;
# the notebook will not run elsewhere until this is pointed at a local copy.
csv_location = 'C:\\Users\\Asus\\Downloads\\DATA - 3.csv'
data = pd.read_csv(csv_location)
preview_rows = data.head(10)
display(preview_rows)

Unnamed: 0,participantID,age,nativeLanguage,gender,education,city,country,responseID,section,cue,R1,R2,R3
0,12,28,URU_R,Fe,4,Montevideo,Uruguay,128,set_2013,bar,abierto,cerveza,noche
1,12,28,URU_R,Fe,4,Montevideo,Uruguay,129,set_2013,tren,expreso,nocturno,bala
2,12,28,URU_R,Fe,4,Montevideo,Uruguay,130,set_2013,mano,libre,derecha,hermano
3,12,28,URU_R,Fe,4,Montevideo,Uruguay,131,set_2013,sopa,fría,Mafalda,verde
4,12,28,URU_R,Fe,4,Montevideo,Uruguay,132,set_2013,especie,ave,Darwin,extinción
5,12,28,URU_R,Fe,4,Montevideo,Uruguay,133,set_2013,mina,linda,minero,carbón
6,12,28,URU_R,Fe,4,Montevideo,Uruguay,134,set_2013,asco,puaj,Freud,feo
7,12,28,URU_R,Fe,4,Montevideo,Uruguay,135,set_2013,gana,pierde,partido,festeja
8,12,28,URU_R,Fe,4,Montevideo,Uruguay,136,set_2013,venta,compra,garage,mercado
9,12,28,URU_R,Fe,4,Montevideo,Uruguay,137,set_2013,iglesia,fuego,cruz,cura


In [7]:
# Encoding categorical variables
#
# Each categorical column gets its OWN fitted LabelEncoder, stored in
# `label_encoders`, so the integer codes can be mapped back to the original
# labels later (e.g. label_encoders['gender'].classes_). Refitting a single
# shared encoder would keep only the mapping of the last column.
#
# Missing values are filled with the column's most frequent value before
# encoding, so they do not end up as a separate, meaningless class.
#
# NOTE: this cell overwrites the columns of `data` in place. Re-running it
# without reloading the CSV refits the encoders on integer codes and loses the
# original label names in `classes_`.
categorical_columns = ['gender', 'nativeLanguage', 'country']
label_encoders = {}
for column in categorical_columns:
    column_modes = data[column].mode(dropna=True)
    if not column_modes.empty:  # an all-missing column has no mode to fill with
        data[column] = data[column].fillna(column_modes.iloc[0])
    column_encoder = LabelEncoder()
    data[column] = column_encoder.fit_transform(data[column])
    label_encoders[column] = column_encoder

# Kept for backward compatibility: as before, `label_encoder` ends up being the
# encoder fitted on the last column ('country').
label_encoder = label_encoders['country']


In [9]:
# Select a random sample to speed up processing.
# The sample size is capped at the number of available rows: sampling without
# replacement raises ValueError when n is larger than the frame.
# random_state is fixed so the same rows are drawn on every run.
SAMPLE_SIZE = 10000
data_sample = data.sample(n=min(SAMPLE_SIZE, len(data)), random_state=42)


In [11]:
# Split the sampled frame into predictor columns (X) and the target column (y).
feature_columns = ['age', 'gender', 'nativeLanguage', 'country']
target_column = 'education'
X_sample = data_sample[feature_columns]
y_sample = data_sample[target_column]


In [13]:
# Hold out 20% of the sampled rows for testing; the fixed random_state makes
# the partition reproducible.
split_parts = train_test_split(
    X_sample,
    y_sample,
    test_size=0.2,
    random_state=42,
)
X_train, X_test, y_train, y_test = split_parts

In [15]:
# Fit a logistic-regression classifier on the training split.
# max_iter is raised to 10000 iterations for the solver.
# NOTE(review): gender, nativeLanguage and country enter as label-encoded
# integers, so the model treats their codes as numeric magnitudes - confirm
# this is intended.
classifier_settings = {'max_iter': 10000}
model = LogisticRegression(**classifier_settings)
model.fit(X_train, y_train)


In [17]:
# Evaluate the model's performance on the held-out test split.
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

# Precision and recall are averaged over classes, weighted by class support.
# zero_division=0: a class the model never predicts contributes a precision of
# 0. With zero_division=1 such a class would be scored as perfectly precise,
# which inflates the weighted average.
precision = precision_score(y_test, y_pred, average='weighted', zero_division=0)
recall = recall_score(y_test, y_pred, average='weighted', zero_division=0)

# RMSE is computed on the education codes themselves, i.e. it treats the class
# labels as numbers and measures how far off the predicted level is.
rmse = np.sqrt(mean_squared_error(y_test, y_pred))

print("Accuracy: ", accuracy)
print("Precision: ", precision)
print("Recall: ", recall)
print("RMSE: ", rmse)

Accuracy:  0.507
Precision:  0.4867468269650615
Recall:  0.507
RMSE:  0.7486654793697917


In [19]:
# Two-sample t-test on age between the rows whose encoded gender is 0 and the
# rows whose encoded gender is 1, using the training split only.
# NOTE(review): 0 and 1 are LabelEncoder codes; which original gender label
# each code stands for is not visible here - confirm before interpreting.
# NOTE(review): the data preview shows one participant spanning many rows, so
# ages are presumably repeated per response and the rows are not independent
# observations - confirm whether the test should use one row per participant.
train_ages = X_train['age']
train_gender = X_train['gender']
group1 = train_ages[train_gender == 0]
group2 = train_ages[train_gender == 1]
t_statistic, p_value = stats.ttest_ind(group1, group2)

In [21]:
# Report the t-test results, one labelled value per line.
ttest_report = (
    ("T-statistic: ", t_statistic),
    ("P-value: ", p_value),
)
for label, value in ttest_report:
    print(label, value)


T-statistic:  14.732009209810963
P-value:  1.743843351908537e-48


In [23]:
# Ordinary least squares regression of education on the four predictors,
# fitted on the sampled rows, followed by the full summary table.
# NOTE(review): gender, nativeLanguage and country are label-encoded integers
# and enter the formula as plain numeric terms - confirm this is intended
# rather than categorical terms.
formula = 'education ~ age + gender + nativeLanguage + country'
unfitted_ols = ols(formula, data=data_sample)
regression_model = unfitted_ols.fit()

regression_summary = regression_model.summary()
print(regression_summary)


                            OLS Regression Results                            
Dep. Variable:              education   R-squared:                       0.015
Model:                            OLS   Adj. R-squared:                  0.015
Method:                 Least Squares   F-statistic:                     39.08
Date:                Thu, 29 Aug 2024   Prob (F-statistic):           1.63e-32
Time:                        10:09:32   Log-Likelihood:                -10189.
No. Observations:               10000   AIC:                         2.039e+04
Df Residuals:                    9995   BIC:                         2.042e+04
Df Model:                           4                                         
Covariance Type:            nonrobust                                         
                     coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------------------------------------------------
Intercept          3.4203      0.038     90.

In [25]:
# Build a one-row frame with the same feature columns the model was trained on
# and ask the fitted classifier for its predicted education level.
# The gender / nativeLanguage / country values are label-encoded codes, not the
# original text labels.
new_row = {'age': 30, 'gender': 1, 'nativeLanguage': 1, 'country': 1}
new_data = pd.DataFrame([new_row])
prediction = model.predict(new_data)

print("Prediction for new data: ", prediction)

Prediction for new data:  [3]


# Data Loading and Preprocessing:
# -> Handles missing values by filling them with the mode for categorical features.
# -> Encodes categorical variables (gender, nativeLanguage, country) using LabelEncoder.

# Model Training:
# ->The dataset is sampled down to 10,000 rows to speed up processing.
# ->The logistic regression model is trained on this smaller dataset.

# Model Evaluation:
# ->The model's accuracy, precision, recall, and RMSE are calculated on the held-out 20% test split.

# Statistical Tests:
# ->A t-test is performed to compare ages between the two gender groups (encoded as 0 and 1) in the training split.
# ->An OLS regression is performed to understand the impact of various features on the education level.

# Prediction:
# ->The model makes a prediction on a new data point, demonstrating how to use the model for future predictions.