In [None]:
# Importing necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Loading data
# Use a path relative to the notebook instead of an absolute path that exists
# on only one machine, so the notebook can be re-run by any team member.
# NOTE(review): assumes the kernel's working directory is the notebooks/ folder
# that holds the CSV (the Jupyter default) - adjust DATA_PATH otherwise.
DATA_PATH = 'student2_success_preprocessed.csv'
df = pd.read_csv(DATA_PATH)
df.head()

In [None]:
# Summarise the loaded frame with DataFrame.info()
df.info()

In [None]:
# List the column labels of the loaded frame
print(df.columns)


In [None]:
# Drop the 'student_id' column before modelling.
# errors='ignore' keeps this cell idempotent: re-running it after the column
# has already been removed no longer raises a KeyError.
df = df.drop(columns=['student_id'], errors='ignore')

# Preparing Data for Model Training

In [None]:
# Target: the 'final_grade' column; features: every remaining column
y = df['final_grade']
X = df.drop(columns='final_grade')

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Train Linear Regression Model (Baseline Model)

Now, we will train the first model, a simple linear regression model.

In [None]:
# Fit the baseline linear regression on the training split
# (fit() returns the estimator itself, so construction and fitting can be chained)
lr_model = LinearRegression().fit(X_train, y_train)

# Predict the held-out test rows
y_pred = lr_model.predict(X_test)

# Score the predictions against the true test targets
mse, r2 = mean_squared_error(y_test, y_pred), r2_score(y_test, y_pred)

print(f"Linear Regression Model - MSE: {mse}, R2: {r2}")

# Train Ridge Regression Model

Next, we train a Ridge Regression model, which adds an L2 regularization penalty to linear regression.

In [None]:
# Fit Ridge regression on the training split; alpha is the regularization strength
ridge_model = Ridge(alpha=1.0).fit(X_train, y_train)

# Predict the held-out test rows
y_pred_ridge = ridge_model.predict(X_test)

# Score the predictions against the true test targets
mse_ridge, r2_ridge = (
    mean_squared_error(y_test, y_pred_ridge),
    r2_score(y_test, y_pred_ridge),
)

print(f"Ridge Regression Model - MSE: {mse_ridge}, R2: {r2_ridge}")

# Train Lasso Regression Model

Similarly, we can train a Lasso Regression model, which also uses regularization but with L1 penalty.

In [None]:
# Fit Lasso regression on the training split; alpha is the regularization strength
lasso_model = Lasso(alpha=0.1).fit(X_train, y_train)

# Predict the held-out test rows
y_pred_lasso = lasso_model.predict(X_test)

# Score the predictions against the true test targets
mse_lasso, r2_lasso = (
    mean_squared_error(y_test, y_pred_lasso),
    r2_score(y_test, y_pred_lasso),
)

print(f"Lasso Regression Model - MSE: {mse_lasso}, R2: {r2_lasso}")

# Compare Model Performance

Now, let's compare the performance of the three models based on their Mean Squared Error (MSE) and R2 scores.

In [None]:
# Gather each model's test-set metrics as one row per model
comparison_rows = [
    ('Linear Regression', mse, r2),
    ('Ridge Regression', mse_ridge, r2_ridge),
    ('Lasso Regression', mse_lasso, r2_lasso),
]
model_comparison = pd.DataFrame(comparison_rows, columns=['Model', 'MSE', 'R2'])

# Show the side-by-side comparison
print(model_comparison)

We evaluated three regression models: Linear Regression, Ridge Regression, and Lasso Regression. The Mean Squared Error (MSE) and R-squared (R²) values obtained for each model on the test set are:

1. **Linear Regression**:
   - **MSE**: 10.41
   - **R²**: 0.50

2. **Ridge Regression**:
   - **MSE**: 10.95
   - **R²**: 0.47

3. **Lasso Regression**:
   - **MSE**: 10.59
   - **R²**: 0.49

### Analysis:
- **MSE**: A lower value of MSE indicates a better fit of the model to the data. In this case, **Linear Regression** has the lowest MSE.
- **R²**: The R² score indicates how well the model explains the variability in the target variable. A higher R² value indicates a better model fit. Here, **Linear Regression** also has the highest R².

### Conclusion:
- **Linear Regression** is the best performing model based on these metrics, as it has the lowest MSE and the highest R².

 # 5. Analysis & Interpretation (All team members)

# 1. Interpret Model Coefficients

In [None]:
# Interpreting the model coefficients
# Derive the inputs from the fitted models so this cell works on a fresh kernel:
# these four names were previously used here without being defined anywhere
# above, which raised NameError under Restart & Run All.
feature_names = list(X_train.columns)
lr_coefficients = lr_model.coef_
ridge_coefficients = ridge_model.coef_
lasso_coefficients = lasso_model.coef_

# One row per feature, one column per model; chaining set_index (instead of
# inplace=True) means re-running the cell always rebuilds the same frame.
coefficients_df = pd.DataFrame({
    'Feature': feature_names,
    'Linear Regression Coefficients': lr_coefficients,
    'Ridge Regression Coefficients': ridge_coefficients,
    'Lasso Regression Coefficients': lasso_coefficients
}).set_index('Feature')

coefficients_df

# 2. Identify the Most Influential Features

In [None]:
# Add one absolute-value column per model so features can be ranked by
# coefficient magnitude regardless of sign
abs_columns = {
    'Linear Regression Abs': 'Linear Regression Coefficients',
    'Ridge Regression Abs': 'Ridge Regression Coefficients',
    'Lasso Regression Abs': 'Lasso Regression Coefficients',
}
for abs_name, coef_name in abs_columns.items():
    coefficients_df[abs_name] = coefficients_df[coef_name].abs()

# Rank the features separately for each model, largest magnitude first
sorted_linear = coefficients_df.sort_values(by='Linear Regression Abs', ascending=False)
sorted_ridge = coefficients_df.sort_values(by='Ridge Regression Abs', ascending=False)
sorted_lasso = coefficients_df.sort_values(by='Lasso Regression Abs', ascending=False)

# Print the top rows (head() default) of each ranking, showing the signed coefficient
ranking_reports = [
    ("Top Influential Features (Linear Regression):", sorted_linear, 'Linear Regression Coefficients'),
    ("\nTop Influential Features (Ridge Regression):", sorted_ridge, 'Ridge Regression Coefficients'),
    ("\nTop Influential Features (Lasso Regression):", sorted_lasso, 'Lasso Regression Coefficients'),
]
for heading, ranked_frame, shown_column in ranking_reports:
    print(heading)
    print(ranked_frame[[shown_column]].head())

### **Linear Regression**:
- **study_resources_Online Resources, Tutoring, Study Groups**: -4.93 (Negative impact)
- **study_resources_Tutoring, Study Group**: 2.70 (Positive impact)
- **family_support_Low**: -2.51 (Negative impact)
- **study_resources_Textbooks, Tutoring, Study Group**: -1.73 (Negative impact)
- **program_of_study_Engineering**: 1.53 (Positive impact)

### **Ridge Regression**:
- **family_support_Low**: -2.44 (Negative impact)
- **study_resources_Online Resources, Tutoring, Study Groups**: -2.33 (Negative impact)
- **previous_gpa**: 1.41 (Positive impact)
- **program_of_study_Engineering**: 1.40 (Positive impact)
- **attendance_rate**: 1.40 (Positive impact)

### **Lasso Regression**:
- **family_support_Low**: -1.61 (Negative impact)
- **attendance_rate**: 1.36 (Positive impact)
- **previous_gpa**: 1.30 (Positive impact)
- **participation_score**: 0.94 (Positive impact)
- **previous_course_failures**: -0.88 (Negative impact)

### Interpretation:
1. **Study Resources**: 
   - For **Linear and Ridge Regression**, the feature "Online Resources, Tutoring, Study Groups" has a significant negative impact on the target variable, while other study resource categories show mixed results.
2. **Family Support**: 
   - **Family Support (Low)** has a consistently negative coefficient across all three models, suggesting it has a negative relationship with the target variable.
3. **Previous GPA**: 
   - **Previous GPA** shows a positive relationship with the target variable, especially in **Ridge** and **Lasso** regressions.
4. **Program of Study**: 
   - **Engineering** has a positive impact in both **Linear and Ridge Regression**, indicating it has a favorable effect on the target variable.
5. **Attendance Rate**: 
   - **Attendance Rate** positively influences the target variable in both **Ridge** and **Lasso Regression**, with a notable coefficient in **Lasso**.

### Conclusion:
- **Family Support (Low)** is among the most influential features in all three models, while **Study Resources** (especially the "Online Resources, Tutoring, Study Groups" category) ranks among the top features only in Linear and Ridge regressions.
- **Previous GPA** and **Attendance Rate** are also important predictors in Ridge and Lasso regressions.
- **Program of Study (Engineering)** is a significant feature in Linear and Ridge regressions.

In [None]:
# Plot the ten largest-magnitude coefficients of each model, one panel per model
fig, axes = plt.subplots(3, 1, figsize=(14, 8))

# (ranked coefficient series, bar colour, panel title) for each panel, top to bottom
panels = [
    (sorted_linear['Linear Regression Coefficients'], 'skyblue',
     'Top 10 Influential Features (Linear Regression)'),
    (sorted_ridge['Ridge Regression Coefficients'], 'lightcoral',
     'Top 10 Influential Features (Ridge Regression)'),
    (sorted_lasso['Lasso Regression Coefficients'], 'lightgreen',
     'Top 10 Influential Features (Lasso Regression)'),
]

for ax, (ranked_coefs, bar_color, panel_title) in zip(axes, panels):
    # Horizontal bars drawn on this panel's own axes
    ranked_coefs.head(10).plot(kind='barh', color=bar_color, ax=ax)
    ax.set_title(panel_title, fontsize=16)
    ax.set_xlabel('Coefficient Value', fontsize=14)
    ax.set_ylabel('Features', fontsize=14)

fig.tight_layout()
plt.show()