In [None]:
# Import Libraries 
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.preprocessing import LabelEncoder, StandardScaler
import matplotlib.pyplot as plt
import seaborn as sns


In [None]:
# Load datasets.
# All three input files are read from one directory; point DATA_DIR at your
# own copy of the data instead of editing every path.
DATA_DIR = '/Users/shambhavi/Desktop/logistic regression'

train_data = pd.read_csv(f'{DATA_DIR}/train.csv')
test_data = pd.read_csv(f'{DATA_DIR}/test.csv')
submission_data = pd.read_csv(f'{DATA_DIR}/sample_submission.csv')

# Working frame used by the plotting, encoding and modelling cells below.
# It is a copy, so the in-place encoding / column drop applied to `df` later
# leaves the raw `train_data` untouched.
df = train_data.copy()

# Display the first few rows of the training data
print("Train Data:")
print(train_data.head())

# Display the first few rows of the test data
print("\nTest Data:")
print(test_data.head())

In [None]:
# Draw one histogram (with a KDE curve overlaid) per numerical feature,
# each on its own 10x6 figure.
numerical_features = ['Age', 'Height', 'Weight', 'FCVC', 'CH2O', 'FAF', 'TUE']
for feature in numerical_features:
    fig, ax = plt.subplots(figsize=(10, 6))
    sns.histplot(df[feature], kde=True, ax=ax)
    ax.set_title(f"Distribution of {feature}")
    plt.show()

In [None]:
# Pairwise correlations between the numeric columns of `df`,
# rendered as an annotated heatmap (values shown to two decimals).
numeric_corr = df.select_dtypes(include=['number']).corr()

fig, ax = plt.subplots(figsize=(12, 8))
sns.heatmap(numeric_corr, annot=True, cmap="coolwarm", fmt=".2f", ax=ax)
ax.set_title("Correlation Heatmap")
plt.show()

In [None]:
# Encode categorical features
categorical_columns = ['Gender', 'family_history_with_overweight', 'FAVC', 'CAEC', 'SMOKE', 'SCC', 'CALC', 'MTRANS']
target = 'NObeyesdad'

# One LabelEncoder per categorical column, kept in a dict keyed by column name.
# A LabelEncoder only remembers the classes from its most recent fit, so a
# single shared instance would lose every column's mapping except the last;
# keeping them separately lets the same mapping be applied to other data.
# NOTE: each column is overwritten with its integer codes, so re-running this
# cell on its own would re-fit the encoders on already-encoded values.
feature_encoders = {}
for col in categorical_columns:
    feature_encoders[col] = LabelEncoder()
    df[col] = feature_encoders[col].fit_transform(df[col])

# Encode the target variable with its own encoder. `encoder` therefore holds
# the class-name <-> integer mapping of the target and can be used with
# `inverse_transform` to turn predicted codes back into class names.
encoder = LabelEncoder()
df[target] = encoder.fit_transform(df[target])

# Drop the 'id' column; errors='ignore' keeps the cell from raising a
# KeyError if the column has already been removed.
df = df.drop(columns=['id'], errors='ignore')

In [None]:
# SPLIT DATA INTO TRAINING AND TESTING SETS
# The target column becomes y; every remaining column of `df` becomes X.
y = df[target]
X = df.drop(columns=[target])

# Hold out 20% of the rows for evaluation; the fixed random_state keeps the
# split repeatable between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Standardize the features. The scaler is fit on the training split only and
# the fitted transformation is then applied to both splits.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [None]:
# Fit a logistic-regression classifier on the scaled training split
# (up to 1000 solver iterations, fixed random_state).
logistic_model = LogisticRegression(max_iter=1000, random_state=42).fit(X_train, y_train)
logistic_model  # show the fitted estimator as the cell output

In [None]:
# Predict labels for the held-out split
y_pred = logistic_model.predict(X_test)

# Compare predictions with the true labels: overall accuracy, confusion
# matrix, and the per-class text report
class_report = classification_report(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)

# Print evaluation results (each heading on its own line, then its value)
print(f"Accuracy: {accuracy:.4f}")
print("\nConfusion Matrix:", conf_matrix, sep="\n")
print("\nClassification Report:", class_report, sep="\n")

In [None]:
# Visualize the importance of features using Logistic Regression coefficients.
# `coef_` is a 2-D array with one row per class when the target has more than
# two classes, so a single row such as `coef_[0]` only describes one class.
# Averaging the absolute coefficients over all rows gives one magnitude per
# feature; when there is only one row this is simply |coefficient|.
# The model was fit on standardized features, so magnitudes are comparable
# across features.
coefficients = pd.DataFrame({
    'Feature': X.columns,
    # Column name kept as 'Coefficient'; it now holds the mean absolute value.
    'Coefficient': np.abs(logistic_model.coef_).mean(axis=0)
}).sort_values(by='Coefficient', ascending=False)

fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(x='Coefficient', y='Feature', data=coefficients, ax=ax)
ax.set_xlabel("Mean |coefficient| across classes")
ax.set_title("Feature Importance - Logistic Regression Coefficients")
plt.show()

In [None]:
# PREPARE TEST PREDICTIONS
# Start from the test frame loaded at the top of the notebook rather than
# re-reading it from a second path. `drop` returns a new frame, so the raw
# `test_data` is left intact and this cell can be re-run.
test_features = test_data.drop(columns=['id'])

# Apply the training label encoding to each categorical column.
# The shared `encoder` cannot be used here: it was last fit on the target
# column, so it only knows the target's class names. A fresh LabelEncoder fit
# on the training column reproduces the training mapping, because
# LabelEncoder assigns codes by sorted class value.
# Assumes `train_data` still holds the raw (un-encoded) training values --
# TODO confirm if `df` is ever made an alias of `train_data`.
# NOTE: `transform` raises ValueError if the test column contains a category
# that never appears in the training column.
for col in categorical_columns:
    col_encoder = LabelEncoder().fit(train_data[col])
    test_features[col] = col_encoder.transform(test_features[col])

# Put the columns in the order the scaler and model were fit on.
test_features = test_features[X.columns]
test_scaled = scaler.transform(test_features)  # Scale the test data

# Make predictions on the test data
test_predictions = logistic_model.predict(test_scaled)

# Save predictions to the submission file. `encoder` holds the target mapping,
# so inverse_transform turns the predicted codes back into class names.
submission = submission_data.copy()
submission[target] = encoder.inverse_transform(test_predictions)
submission.to_csv('submission.csv', index=False)