In [None]:
pip install pandas scikit-learn



In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_percentage_error

# Load the updated dataset
data = pd.read_csv('studentsyay.csv')  # Update this to the correct path

# Define features and target variable
X = data[['gender', 'parental level of education', 'lunch', 'test preparation course', 'hours_of_sleep', 'Mental Health']]
y = data['cgpa']

# Convert categorical variables to numerical
X = pd.get_dummies(X, drop_first=True)

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize the Random Forest Regressor
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)

# Train the model
rf_model.fit(X_train, y_train)

# Predict and evaluate
y_pred = rf_model.predict(X_test)

# Calculate performance metrics
rf_rmse = mean_squared_error(y_test, y_pred, squared=False)
rf_r2 = r2_score(y_test, y_pred)
rf_mape = mean_absolute_percentage_error(y_test, y_pred)

# Print the results
print("Random Forest - RMSE:", rf_rmse)
print("Random Forest - R2 Score:", rf_r2)
print("Random Forest - MAPE (Approximate Accuracy):", 1 - rf_mape)


Random Forest - RMSE: 7.5401261427332225
Random Forest - R2 Score: 0.7663603350382473
Random Forest - MAPE (Approximate Accuracy): -718719992675697.4




In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from xgboost import XGBRegressor

# Load the updated dataset
data = pd.read_csv('studentsyay.csv')  # Update this to the correct path

# Define features and target variable
X = data[['gender', 'parental level of education', 'lunch', 'test preparation course', 'hours_of_sleep', 'Mental Health']]
y = data['cgpa']

# Convert categorical variables to numerical
X = pd.get_dummies(X, drop_first=True)

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize models
models = {
    "Linear Regression": LinearRegression(),
    "Lasso": Lasso(),
    "Ridge": Ridge(),
    "Decision Tree": DecisionTreeRegressor(),
    "Random Forest Regressor": RandomForestRegressor(n_estimators=100, random_state=42),
    "XGBRegressor": XGBRegressor(n_estimators=100, random_state=42)
}

# Train and evaluate models
results = {}
for model_name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    results[model_name] = {'Mean Absolute Error': mae, 'R² Score': r2}

# Create a DataFrame from the results
results_df = pd.DataFrame.from_dict(results, orient='index').reset_index()
results_df.columns = ['Model Name', 'Mean Absolute Error', 'R² Score']

# Sort the DataFrame by R² Score in descending order
results_df_sorted = results_df.sort_values(by='R² Score', ascending=False)

# Print the sorted results
print(results_df_sorted)


                Model Name  Mean Absolute Error  R² Score
4  Random Forest Regressor             5.799194  0.766360
0        Linear Regression             5.709143  0.752958
2                    Ridge             5.689556  0.752079
5             XGBRegressor             6.235182  0.732936
3            Decision Tree             6.625833  0.708956
1                    Lasso             6.262141  0.645697


In [None]:
pip install xgboost



In [None]:
import pandas as pd

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error, r2_score

In [None]:
data = pd.read_csv('studentsyay.csv')

In [None]:
label_encoders = {}
for column in ['gender', 'parental level of education', 'lunch', 'test preparation course', 'Mental Health']:
    le = LabelEncoder()
    data[column] = le.fit_transform(data[column])
    label_encoders[column] = le

In [None]:
X = data.drop(columns=['cgpa'])
y = data['cgpa']

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
models = {
    "Linear Regression": LinearRegression(),
    "Lasso": Lasso(),
    "Ridge": Ridge(),
    "Decision Tree": DecisionTreeRegressor(),
    "Random Forest Regressor": RandomForestRegressor(),
    "XGBRegressor": XGBRegressor()
}

In [None]:
results = {}
for model_name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    results[model_name] = {'Mean Absolute Error': mae, 'R² Score': r2}


In [None]:
for model_name, metrics in results.items():
    print(f"{model_name} - Mean Absolute Error: {metrics['Mean Absolute Error']}, R² Score: {metrics['R² Score']}")

Linear Regression - Mean Absolute Error: 4.214550528849063, R² Score: 0.87996847200688
Lasso - Mean Absolute Error: 5.016968645898492, R² Score: 0.8336050685822345
Ridge - Mean Absolute Error: 4.217776923490342, R² Score: 0.8798375467369507
Decision Tree - Mean Absolute Error: 6.9875, R² Score: 0.6955936670907674
Random Forest Regressor - Mean Absolute Error: 4.969325952380952, R² Score: 0.83630417105037
XGBRegressor - Mean Absolute Error: 5.4218174171447755, R² Score: 0.8066977858543396


In [None]:
pd.DataFrame(list(zip(model_list, r2_list)), columns=['Model Name', 'R2_Score']).sort_values(by=["R2_Score"],ascending=False)

Unnamed: 0,Model Name,R2_Score
0,Linear Regression,0.879968
2,Ridge,0.879838
7,AdaBoost Regressor,0.843017
5,Random Forest Regressor,0.834779
1,Lasso,0.833605
6,XGBRegressor,0.806698
4,Decision Tree,0.705744
3,K-Neighbors Regressor,0.665817


In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Load the dataset
file_path = 'stud.csv'  # Update this path if needed
subject_data = pd.read_csv(file_path)

# Data Preprocessing
# Encode categorical variables
label_encoders = {}
for column in ['gender', 'race_ethnicity', 'parental_level_of_education', 'lunch', 'test_preparation_course']:
    le = LabelEncoder()
    subject_data[column] = le.fit_transform(subject_data[column])
    label_encoders[column] = le

# Define features and target variable
X = subject_data.drop(columns=['subject1'])
y = subject_data['subject1']

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Function to evaluate model performance
def evaluate_model(true, predicted):
    mae = mean_absolute_error(true, predicted)
    mse = mean_squared_error(true, predicted)
    rmse = np.sqrt(mse)
    r2_square = r2_score(true, predicted)
    return mae, rmse, r2_square

# Define models to evaluate
models = {
    "Linear Regression": LinearRegression(),
    "Lasso": Lasso(),
    "Ridge": Ridge(),
    "K-Neighbors Regressor": KNeighborsRegressor(),
    "Decision Tree": DecisionTreeRegressor(),
    "Random Forest Regressor": RandomForestRegressor(),
    "XGBRegressor": XGBRegressor(),
    "AdaBoost Regressor": AdaBoostRegressor()
}

model_list = []
r2_list = []

# Train and evaluate each model
for i in range(len(models)):
    model_name = list(models.keys())[i]
    model = list(models.values())[i]

    model.fit(X_train, y_train)  # Train model

    # Make predictions
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)

    # Evaluate Train and Test dataset
    model_train_mae, model_train_rmse, model_train_r2 = evaluate_model(y_train, y_train_pred)
    model_test_mae, model_test_rmse, model_test_r2 = evaluate_model(y_test, y_test_pred)

    print(model_name)
    model_list.append(model_name)

    print('Model performance for Training set')
    print("- Root Mean Squared Error: {:.4f}".format(model_train_rmse))
    print("- Mean Absolute Error: {:.4f}".format(model_train_mae))
    print("- R² Score: {:.4f}".format(model_train_r2))

    print('----------------------------------')

    print('Model performance for Test set')
    print("- Root Mean Squared Error: {:.4f}".format(model_test_rmse))
    print("- Mean Absolute Error: {:.4f}".format(model_test_mae))
    print("- R² Score: {:.4f}".format(model_test_r2))

    r2_list.append(model_test_r2)

    print('='*35)
    print('\n')


Linear Regression
Model performance for Training set
- Root Mean Squared Error: 5.5876
- Mean Absolute Error: 4.4947
- R² Score: 0.8615
----------------------------------
Model performance for Test set
- Root Mean Squared Error: 5.3175
- Mean Absolute Error: 4.1301
- R² Score: 0.8838


Lasso
Model performance for Training set
- Root Mean Squared Error: 6.4474
- Mean Absolute Error: 5.0928
- R² Score: 0.8156
----------------------------------
Model performance for Test set
- Root Mean Squared Error: 6.3196
- Mean Absolute Error: 4.9831
- R² Score: 0.8359


Ridge
Model performance for Training set
- Root Mean Squared Error: 5.5878
- Mean Absolute Error: 4.4942
- R² Score: 0.8615
----------------------------------
Model performance for Test set
- Root Mean Squared Error: 5.3195
- Mean Absolute Error: 4.1310
- R² Score: 0.8837


K-Neighbors Regressor
Model performance for Training set
- Root Mean Squared Error: 7.1743
- Mean Absolute Error: 5.7570
- R² Score: 0.7717
-----------------------

In [None]:
pd.DataFrame(list(zip(model_list, r2_list)), columns=['Model Name', 'R2_Score']).sort_values(by=["R2_Score"],ascending=False)

Unnamed: 0,Model Name,R2_Score
0,Linear Regression,0.883803
2,Ridge,0.883713
7,AdaBoost Regressor,0.846559
5,Random Forest Regressor,0.845547
1,Lasso,0.835876
6,XGBRegressor,0.824909
4,Decision Tree,0.73167
3,K-Neighbors Regressor,0.66164


In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Load the dataset
file_path = 'studs.csv'  # Update this path if needed
subject_data = pd.read_csv(file_path)

# Data Preprocessing
# Encode categorical variables
label_encoders = {}
for column in ['gender', 'parental_level_of_education', 'lunch', 'test_preparation']:
    le = LabelEncoder()
    subject_data[column] = le.fit_transform(subject_data[column])
    label_encoders[column] = le

# Define features and target variable
X = subject_data.drop(columns=['cgpa'])
y = subject_data['cgpa']

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Function to evaluate model performance
def evaluate_model(true, predicted):
    mae = mean_absolute_error(true, predicted)
    mse = mean_squared_error(true, predicted)
    rmse = np.sqrt(mse)
    r2_square = r2_score(true, predicted)
    return mae, rmse, r2_square

# Define models to evaluate
models = {
    "Linear Regression": LinearRegression(),
    "Lasso": Lasso(),
    "Ridge": Ridge(),
    "K-Neighbors Regressor": KNeighborsRegressor(),
    "Decision Tree": DecisionTreeRegressor(),
    "Random Forest Regressor": RandomForestRegressor(),
    "XGBRegressor": XGBRegressor(),
    "AdaBoost Regressor": AdaBoostRegressor()
}

model_list = []
r2_list = []

# Train and evaluate each model
for i in range(len(models)):
    model_name = list(models.keys())[i]
    model = list(models.values())[i]

    model.fit(X_train, y_train)  # Train model

    # Make predictions
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)

    # Evaluate Train and Test dataset
    model_train_mae, model_train_rmse, model_train_r2 = evaluate_model(y_train, y_train_pred)
    model_test_mae, model_test_rmse, model_test_r2 = evaluate_model(y_test, y_test_pred)

    print(model_name)
    model_list.append(model_name)

    print('Model performance for Training set')
    print("- Root Mean Squared Error: {:.4f}".format(model_train_rmse))
    print("- Mean Absolute Error: {:.4f}".format(model_train_mae))
    print("- R² Score: {:.4f}".format(model_train_r2))

    print('----------------------------------')

    print('Model performance for Test set')
    print("- Root Mean Squared Error: {:.4f}".format(model_test_rmse))
    print("- Mean Absolute Error: {:.4f}".format(model_test_mae))
    print("- R² Score: {:.4f}".format(model_test_r2))

    r2_list.append(model_test_r2)

    print('='*35)
    print('\n')


Linear Regression
Model performance for Training set
- Root Mean Squared Error: 5.6697
- Mean Absolute Error: 4.5476
- R² Score: 0.8574
----------------------------------
Model performance for Test set
- Root Mean Squared Error: 5.4045
- Mean Absolute Error: 4.2146
- R² Score: 0.8800


Lasso
Model performance for Training set
- Root Mean Squared Error: 6.4757
- Mean Absolute Error: 5.1130
- R² Score: 0.8140
----------------------------------
Model performance for Test set
- Root Mean Squared Error: 6.3632
- Mean Absolute Error: 5.0170
- R² Score: 0.8336


Ridge
Model performance for Training set
- Root Mean Squared Error: 5.6698
- Mean Absolute Error: 4.5467
- R² Score: 0.8574
----------------------------------
Model performance for Test set
- Root Mean Squared Error: 5.4074
- Mean Absolute Error: 4.2178
- R² Score: 0.8798


K-Neighbors Regressor
Model performance for Training set
- Root Mean Squared Error: 7.2155
- Mean Absolute Error: 5.7538
- R² Score: 0.7691
-----------------------

In [None]:
pd.DataFrame(list(zip(model_list, r2_list)), columns=['Model Name', 'R2_Score']).sort_values(by=["R2_Score"],ascending=False)

Unnamed: 0,Model Name,R2_Score
0,Linear Regression,0.879968
2,Ridge,0.879838
7,AdaBoost Regressor,0.847357
5,Random Forest Regressor,0.833809
1,Lasso,0.833605
6,XGBRegressor,0.806698
4,Decision Tree,0.705446
3,K-Neighbors Regressor,0.665817


In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

# Load the updated dataset
data = pd.read_csv('studs.csv')  # Update this to the correct path

# Function to assign study hours and sleep hours based on CGPA
def assign_hours(cgpa, max_cgpa, min_cgpa):
    # Normalize CGPA between 0 and 1
    normalized_cgpa = (cgpa - min_cgpa) / (max_cgpa - min_cgpa)

    # Assign study hours (1 to 12)
    study_hours = round(normalized_cgpa * 11 + 1)  # Scale to 1-12
    # Assign sleep hours (1 to 8)
    sleep_hours = round(normalized_cgpa * 7 + 1)  # Scale to 1-8

    return study_hours, sleep_hours

# Determine max and min CGPA for normalization
max_cgpa = data['cgpa'].max()
min_cgpa = data['cgpa'].min()

# Apply the function to create new columns
data[['study_hours', 'hours_of_sleep']] = data['cgpa'].apply(lambda x: assign_hours(x, max_cgpa, min_cgpa)).apply(pd.Series)

# Data Preprocessing
# Encode categorical variables
label_encoders = {}
for column in ['gender', 'parental_level_of_education', 'lunch', 'test_preparation']:
    le = LabelEncoder()
    data[column] = le.fit_transform(data[column])
    label_encoders[column] = le

# Define features and target variable
X = data[['gender', 'parental_level_of_education', 'lunch', 'test_preparation', 'study_hours', 'hours_of_sleep']]
y = data['cgpa']

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize the Random Forest Regressor
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)

# Train the model
rf_model.fit(X_train, y_train)

# Predict and evaluate
y_pred = rf_model.predict(X_test)

# Calculate performance metrics
rf_rmse = mean_squared_error(y_test, y_pred, squared=False)
rf_r2 = r2_score(y_test, y_pred)
rf_mape = mean_absolute_error(y_test, y_pred)

# Print the results
print("Random Forest - RMSE:", rf_rmse)
print("Random Forest - R² Score:", rf_r2)
print("Random Forest - MAPE (Approximate Accuracy):", 1 - rf_mape)


Random Forest - RMSE: 2.6672229665211993
Random Forest - R² Score: 0.9707646702980781
Random Forest - MAPE (Approximate Accuracy): -1.051103252899921




In [None]:

non_zero_indices = y_test != 0
y_test_non_zero = y_test[non_zero_indices]
y_pred_non_zero = y_pred[non_zero_indices]

mape = mean_absolute_percentage_error(y_test_non_zero, y_pred_non_zero)
print("MAPE:", mape)

NameError: name 'y_pred' is not defined

In [None]:
models = {
    "Linear Regression": LinearRegression(),
    "Lasso": Lasso(),
    "Ridge": Ridge(),
    "Decision Tree": DecisionTreeRegressor(),
    "Random Forest": RandomForestRegressor(n_estimators=100, random_state=42),
    "XGBRegressor": XGBRegressor(n_estimators=100, learning_rate=0.1, max_depth=3, random_state=42)
}


for model in models.values():
    model.fit(X_train, y_train)


def display_predictions(input_data):
    input_df = pd.DataFrame(input_data)


    predictions = {}
    for model_name, model in models.items():
        pred = model.predict(input_df)
        predictions[model_name] = pred[0]

    return predictions


example_input = {
    'gender': [0],  # Encode appropriately
    'parental_level_of_education': [2],  # Example encoded value
    'lunch': [1],  # Example encoded value
    'test_preparation': [1],  # Example encoded value
    'study_hours': [8],  # Example study hours
    'hours_of_sleep': [5]  # Example sleep hours
}


predictions = display_predictions(example_input)
print("Predicted CGPA based on input:")
for model_name, predicted_cgpa in predictions.items():
    print(f"{model_name}: {predicted_cgpa:.2f}")


Predicted CGPA based on input:
Linear Regression: 62.30
Lasso: 64.00
Ridge: 62.30
Decision Tree: 61.43
Random Forest: 61.45
XGBRegressor: 62.01


In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

# Load the updated dataset
data = pd.read_csv('studs.csv')  # Make sure to update this path if necessary

# Function to assign study hours and sleep hours based on CGPA
def assign_hours(cgpa, max_cgpa, min_cgpa):
    # Normalize CGPA between 0 and 1
    normalized_cgpa = (cgpa - min_cgpa) / (max_cgpa - min_cgpa)

    # Assign study hours (1 to 12)
    study_hours = round(normalized_cgpa * 11 + 1)  # Scale to 1-12
    # Assign sleep hours (1 to 8)
    sleep_hours = round(normalized_cgpa * 7 + 1)  # Scale to 1-8

    return study_hours, sleep_hours

# Determine max and min CGPA for normalization
max_cgpa = data['cgpa'].max()
min_cgpa = data['cgpa'].min()

# Apply the function to create new columns
data[['study_hours', 'hours_of_sleep']] = data['cgpa'].apply(lambda x: assign_hours(x, max_cgpa, min_cgpa)).apply(pd.Series)

# Data Preprocessing
# Encode categorical variables
label_encoders = {}
for column in ['gender', 'parental_level_of_education', 'lunch', 'test_preparation']:
    le = LabelEncoder()
    data[column] = le.fit_transform(data[column])
    label_encoders[column] = le

# Define features and target variable
X = data[['gender', 'parental_level_of_education', 'lunch', 'test_preparation', 'study_hours', 'hours_of_sleep', 'lastsemcgpa', 'cgpabefore']]
y = data['cgpa']

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize the Linear Regression model
lr_model = LinearRegression()

# Train the model
lr_model.fit(X_train, y_train)

# Predict and evaluate
y_train_pred = lr_model.predict(X_train)  # Predictions on training data
y_test_pred = lr_model.predict(X_test)  # Predictions on testing data

# Calculate performance metrics
lr_rmse = mean_squared_error(y_test, y_test_pred, squared=False)  # RMSE for testing data
lr_r2 = r2_score(y_test, y_test_pred)  # R² Score for testing data

# Calculate MAPE (Mean Absolute Percentage Error)
lr_mape = np.mean(np.abs((y_test - y_test_pred) / y_test)) * 100  # MAPE as a percentage

# Print the results
print("Linear Regression - RMSE:", lr_rmse)
print("Linear Regression - R² Score:", lr_r2)


Linear Regression - RMSE: 2.0408509142894182
Linear Regression - R² Score: 0.9828836202820601




In [None]:
non_zero_indices = y_test != 0
y_test_non_zero = y_test[non_zero_indices]
y_pred_non_zero = y_pred[non_zero_indices]

mape = mean_absolute_percentage_error(y_test_non_zero, y_pred_non_zero)
print("MAPE:", mape)

NameError: name 'y_pred' is not defined

In [None]:
def assign_hours(cgpa, max_cgpa, min_cgpa):
    normalized_cgpa = (cgpa - min_cgpa) / (max_cgpa - min_cgpa)
    study_hours = round(normalized_cgpa * 11 + 1)  # Scale to 1-12
    sleep_hours = round(normalized_cgpa * 7 + 1)  # Scale to 1-8
    return study_hours, sleep_hours

# Determine max and min CGPA for normalization
max_cgpa = data['cgpa'].max()
min_cgpa = data['cgpa'].min()

# Apply the function to create new columns
data[['study_hours', 'hours_of_sleep']] = data['cgpa'].apply(lambda x: assign_hours(x, max_cgpa, min_cgpa)).apply(pd.Series)

# Encode categorical variables
label_encoders = {}
for column in ['gender', 'parental_level_of_education', 'lunch', 'test_preparation']:
    le = LabelEncoder()
    data[column] = le.fit_transform(data[column])
    label_encoders[column] = le

# Define features and target variable
X = data[['gender', 'parental_level_of_education', 'lunch', 'test_preparation', 'study_hours', 'hours_of_sleep', 'lastsemcgpa', 'cgpabefore']]
y = data['cgpa']

# Train the Linear Regression model
lr_model = LinearRegression()
lr_model.fit(X, y)

# Function to predict CGPA
def predict_cgpa(input_data):
    # Create a DataFrame for the input data
    input_df = pd.DataFrame(input_data)

    # Make predictions using the trained model
    predicted_cgpa = lr_model.predict(input_df)

    return predicted_cgpa

# Example input data for prediction
example_input = {
    'gender': [1],  # Example encoded value for gender
    'parental_level_of_education': [2],  # Example encoded value
    'lunch': [1],  # Example encoded value
    'test_preparation': [1],  # Example encoded value
    'study_hours': [8],  # Example study hours
    'hours_of_sleep': [5],  # Example hours of sleep
    'lastsemcgpa': [90],  # Example last semester CGPA
    'cgpabefore': [50]  # Example previous CGPA
}

# Predict CGPA based on example input
predicted_cgpa = predict_cgpa(example_input)

# Display the predicted CGPA
print("Predicted CGPA based on input:", predicted_cgpa[0])


Predicted CGPA based on input: 62.36955463545961


In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

# Load the updated dataset
data = pd.read_csv('studs.csv')  # Update this path if necessary

# Function to assign study hours and sleep hours based on CGPA
def assign_hours(cgpa, max_cgpa, min_cgpa):
    # Normalize CGPA between 0 and 1
    normalized_cgpa = (cgpa - min_cgpa) / (max_cgpa - min_cgpa)

    # Assign study hours (1 to 12)
    study_hours = round(normalized_cgpa * 11 + 1)  # Scale to 1-12
    # Assign sleep hours (1 to 8)
    sleep_hours = round(normalized_cgpa * 7 + 1)  # Scale to 1-8

    return study_hours, sleep_hours

# Determine max and min CGPA for normalization
max_cgpa = data['cgpa'].max()
min_cgpa = data['cgpa'].min()

# Apply the function to create new columns
data[['study_hours', 'hours_of_sleep']] = data['cgpa'].apply(lambda x: assign_hours(x, max_cgpa, min_cgpa)).apply(pd.Series)

# Data Preprocessing
# Encode categorical variables
label_encoders = {}
for column in ['gender', 'parental_level_of_education', 'lunch', 'test_preparation']:
    le = LabelEncoder()
    data[column] = le.fit_transform(data[column])
    label_encoders[column] = le

# Define features and target variable
X = data[['gender', 'parental_level_of_education', 'lunch', 'test_preparation', 'study_hours', 'hours_of_sleep', 'lastsemcgpa', 'cgpabefore']]
y = data['cgpa']

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize the Linear Regression model
lr_model = LinearRegression()

# Train the model
lr_model.fit(X_train, y_train)

# Predict and evaluate
y_train_pred = lr_model.predict(X_train)  # Predictions on training data
y_test_pred = lr_model.predict(X_test)  # Predictions on testing data

# Calculate performance metrics
lr_rmse = mean_squared_error(y_test, y_test_pred, squared=False)  # RMSE for testing data
lr_r2 = r2_score(y_test, y_test_pred)  # R² Score for testing data

# Calculate MAPE (Mean Absolute Percentage Error) excluding zero values
non_zero_indices = y_test != 0
y_test_non_zero = y_test[non_zero_indices]
y_pred_non_zero = y_test_pred[non_zero_indices]  # Use y_test_pred instead of y_pred

# Calculate MAPE safely as a decimal
lr_mape = np.mean(np.abs((y_test_non_zero - y_pred_non_zero) / y_test_non_zero))  # MAPE as a decimal

# Print the results
print("Linear Regression - RMSE:", lr_rmse)
print("Linear Regression - R² Score:", lr_r2)
print("Linear Regression - MAPE (as decimal):", lr_mape)


Linear Regression - RMSE: 2.0408509142894182
Linear Regression - R² Score: 0.9828836202820601
Linear Regression - MAPE (as decimal): 0.02770073655091981




In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor
from sklearn.tree import DecisionTreeRegressor
from xgboost import XGBRegressor
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

# Load the updated dataset
data = pd.read_csv('studs.csv')  # Update this path if necessary

# Function to assign study hours and sleep hours based on CGPA
def assign_hours(cgpa, max_cgpa, min_cgpa):
    normalized_cgpa = (cgpa - min_cgpa) / (max_cgpa - min_cgpa)
    hours_req_per_unit = round((1 - normalized_cgpa) * 11 + 1)  # Inverse scaling to 1-12
    sleep_hours = round(normalized_cgpa * 7 + 1)  # Scale sleep hours normally to 1-8

    return hours_req_per_unit, sleep_hours


# Determine max and min CGPA for normalization
max_cgpa = data['cgpa'].max()
min_cgpa = data['cgpa'].min()

# Apply the function to create new columns
data[['hours_req_per_unit', 'hours_of_sleep']] = data['cgpa'].apply(lambda x: assign_hours(x, max_cgpa, min_cgpa)).apply(pd.Series)

# Data Preprocessing
# Encode categorical variables
label_encoders = {}
for column in ['gender', 'parental_level_of_education', 'lunch', 'test_preparation']:
    le = LabelEncoder()
    data[column] = le.fit_transform(data[column])
    label_encoders[column] = le

# Define features and target variable
X = data[['gender', 'parental_level_of_education', 'lunch', 'test_preparation', 'hours_req_per_unit', 'hours_of_sleep', 'lastsemcgpa', 'cgpabefore']]
y = data['cgpa']

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define the models to evaluate
models = {
    "Linear Regression": LinearRegression(),
    "Lasso Regression": Lasso(),
    "Ridge Regression": Ridge(),
    "Decision Tree": DecisionTreeRegressor(),
    "Random Forest": RandomForestRegressor(n_estimators=100, random_state=42),
    "XGBoost": XGBRegressor(n_estimators=100, learning_rate=0.1, max_depth=3, random_state=42),
    "AdaBoost": AdaBoostRegressor()
}

# Initialize a list to hold results
results = []

# Evaluate each model
for model_name, model in models.items():
    model.fit(X_train, y_train)  # Train the model
    y_test_pred = model.predict(X_test)  # Predictions on testing data

    # Calculate performance metrics
    rmse = np.sqrt(mean_squared_error(y_test, y_test_pred))  # RMSE for testing data
    r2 = r2_score(y_test, y_test_pred)  # R² Score for testing data

    # Calculate MAPE (Mean Absolute Percentage Error) excluding zero values
    non_zero_indices = y_test != 0
    y_test_non_zero = y_test[non_zero_indices]
    y_pred_non_zero = y_test_pred[non_zero_indices]

    # Calculate MAPE safely
    mape = np.mean(np.abs((y_test_non_zero - y_pred_non_zero) / y_test_non_zero))  # MAPE as a decimal

    # Store the results
    results.append({
        "Model": model_name,
        "RMSE": rmse,
        "R² Score": r2,
        "MAPE": mape
    })

# Create a DataFrame from the results
results_df = pd.DataFrame(results)

# Sort the DataFrame by R² Score in descending order
results_df_sorted = results_df.sort_values(by='R² Score', ascending=False)

# Display the results
print("Model Performance Metrics (Sorted by R² Score):")
print(results_df_sorted)


Model Performance Metrics (Sorted by R² Score):
               Model      RMSE  R² Score      MAPE
2   Ridge Regression  2.024958  0.983149  0.027259
0  Linear Regression  2.025806  0.983135  0.027275
5            XGBoost  2.264681  0.978923  0.028690
4      Random Forest  2.448691  0.975359  0.030200
1   Lasso Regression  2.556947  0.973132  0.034728
3      Decision Tree  2.795532  0.967884  0.034648
6           AdaBoost  2.813152  0.967478  0.037106


In [None]:
models = {
    "Linear Regression": LinearRegression(),
    "Lasso Regression": Lasso(),
    "Ridge Regression": Ridge(),
    "Decision Tree": DecisionTreeRegressor(),
    "Random Forest": RandomForestRegressor(n_estimators=100, random_state=42),
    "XGBoost": XGBRegressor(n_estimators=100, learning_rate=0.1, max_depth=3, random_state=42),
    "AdaBoost": AdaBoostRegressor()
}


for model_name, model in models.items():
    model.fit(X_train, y_train)  # Train the model

def predict_cgpa_all_models(input_data):
    input_df = pd.DataFrame(input_data)  # Create DataFrame from input data
    predictions = {}

    for model_name, model in models.items():
        pred = model.predict(input_df)  # Predict CGPA using the model
        predictions[model_name] = pred[0]  # Store the prediction

    return predictions

example_input = {
    'gender': [1],  # Example encoded value for gender
    'parental_level_of_education': [2],  # Example encoded value
    'lunch': [1],  # Example encoded value
    'test_preparation': [1],  # Example encoded value
    'hours_req_per_unit': [4],  # Example study hours
    'hours_of_sleep': [5],  # Example hours of sleep
    'lastsemcgpa': [90],  # Example last semester CGPA
    'cgpabefore': [88]  # Example previous CGPA
}

# Predict CGPA based on example input using all models
predicted_cgpas = predict_cgpa_all_models(example_input)

# Display the predicted CGPA for each model
print("Predicted CGPA based on input:")
for model_name, predicted_cgpa in predicted_cgpas.items():
    print(f"{model_name}: {predicted_cgpa:.2f}")


Predicted CGPA based on input:
Linear Regression: 71.60
Lasso Regression: 74.59
Ridge Regression: 71.62
Decision Tree: 77.00
Random Forest: 71.41
XGBoost: 66.49
AdaBoost: 72.62


In [None]:
# Install the old version of OpenAI
!pip install openai==0.27.0

Collecting openai==0.27.0
  Downloading openai-0.27.0-py3-none-any.whl.metadata (13 kB)
Downloading openai-0.27.0-py3-none-any.whl (70 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m70.1/70.1 kB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: openai
  Attempting uninstall: openai
    Found existing installation: openai 1.52.2
    Uninstalling openai-1.52.2:
      Successfully uninstalled openai-1.52.2
Successfully installed openai-0.27.0


In [None]:
import openai

# Set your OpenAI API Key
# openai.api_key = (Removed for security) 'sk-ofahujN1md_yzLD-lOQtxm1NOR52oo-BPzEVJBUgIkT3BlbkFJMYNDSai0JNkucKQbWYgsJuvPt6kd_hXhJtzy7MS9oA'  # Replace with your actual API key

# Function to generate suggestions using OpenAI API based on provided input
def generate_suggestions(cgpa, hours_req_per_unit, sleep_hours):
    prompt = (
        f"Based on the following student details:\n"
        f"CGPA: {cgpa}\n"
        f"Hours required to complete a unit: {hours_req_per_unit} hours\n"
        f"Hours of sleep: {sleep_hours} hours a day\n"
        f"Please provide personalized suggestions to improve academic performance. "
        f"1. If the CGPA is 10, appreciate the student's effort and suggest maintaining high performance through advanced coursework or leadership roles in study groups.\n"
        f"2. If the CGPA is between 7 and 10, acknowledge their achievements while encouraging them to set higher goals and explore opportunities for enrichment in their studies.\n"
        f"3. If the CGPA is between 5 and 7, do scold them and get them to improve, encourage consistent study habits, recommend utilizing available academic resources like tutoring, and emphasize the importance of time management.\n"
        f"4. If the CGPA is below 5, show disappointment and then provide constructive feedback on the necessity of developing effective study strategies, suggest reaching out for academic support, and emphasize the importance of dedication and effort in their studies."
    )

    # Call the OpenAI API to generate suggestions
    response = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",  # Use the desired model
        messages=[
            {"role": "user", "content": prompt}
        ],
        max_tokens=150  # Adjust based on desired response length
    )

    return response['choices'][0]['message']['content']

# Example input for suggestions
cgpa = 9.5  # Example CGPA
hours_req_per_unit = 7  # Example hours required to complete a unit
sleep_hours = 1  # Example hours of sleep

# Generate suggestions
suggestions = generate_suggestions(cgpa, hours_req_per_unit, sleep_hours)
print("Suggestions to improve academic performance:")
print(suggestions)

def generate_study_plan(cgpa, subjects, study_hours_available, exam_date):
    prompt = (
        f"Create a personalized study plan for a student with a CGPA of {cgpa}.\n"
        f"They have the following subjects to study: {', '.join(subjects)}.\n"
        f"They have {study_hours_available} hours available for study each day.\n"
        f"The exam date is {exam_date}.\n"
        f"Please provide a detailed weekly study schedule, including daily topics and recommended resources."
    )

    # Call the OpenAI API to generate the study plan
    response = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",
        messages=[{"role": "user", "content": prompt}],
        max_tokens=500  # Adjust based on desired response length
    )

    return response['choices'][0]['message']['content']

# Example usage
cgpa = 7
subjects = ["Chinese", "DSA", "Machine Learning","Tamil"]
study_hours_available = 10
exam_date = "2024-11-08"

study_plan = generate_study_plan(cgpa, subjects, study_hours_available, exam_date)
print("Personalized Study Plan:")
print(study_plan)

Suggestions to improve academic performance:
Based on the student's CGPA of 9.5, it is clear that they are performing exceptionally well academically. I would like to commend the student for their hard work and dedication that has led to such impressive results. To continue excelling in their academic pursuits, I would suggest exploring advanced coursework or taking on leadership roles in study groups to further enhance their learning experience and knowledge.

Additionally, as the student only gets 1 hour of sleep a day, I would recommend prioritizing adequate rest and creating a study schedule that allows for proper rest and relaxation. Sleep is crucial for cognitive functioning and overall well-being, so it is important to prioritize this aspect of self-care in order to maintain high academic performance.
Personalized Study Plan:
Weekly Study Schedule:

Day 1-3:
- Chinese: Focus on vocabulary building and grammar. Use resources such as textbooks, online language learning platforms, 

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Ridge
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.metrics import mean_squared_error as mse
from sklearn.metrics import mean_squared_error as mse, root_mean_squared_error

# Load the updated dataset
data = pd.read_csv('studs.csv')  # Update this path if necessary

# Function to assign hours required to complete a unit and sleep hours based on CGPA
def assign_hours(cgpa, max_cgpa, min_cgpa):
    normalized_cgpa = (cgpa - min_cgpa) / (max_cgpa - min_cgpa)
    hours_req_per_unit = round((1 - normalized_cgpa) * 11 + 1)  # Inverse scaling to 1-12
    sleep_hours = round(normalized_cgpa * 7 + 1)  # Scale sleep hours to 1-8
    return hours_req_per_unit, sleep_hours

# Determine max and min CGPA for normalization
max_cgpa = data['cgpa'].max()
min_cgpa = data['cgpa'].min()

# Apply the function to create new columns
data[['hours_req_per_unit', 'hours_of_sleep']] = data['cgpa'].apply(lambda x: assign_hours(x, max_cgpa, min_cgpa)).apply(pd.Series)

# Data Preprocessing
# Encode categorical variables
label_encoders = {}
for column in ['gender', 'parental_level_of_education', 'lunch', 'test_preparation']:
    le = LabelEncoder()
    data[column] = le.fit_transform(data[column])
    label_encoders[column] = le

# Define features and target variable for Ridge Regression
X = data[['gender', 'parental_level_of_education', 'lunch', 'test_preparation', 'hours_req_per_unit', 'hours_of_sleep', 'lastsemcgpa', 'cgpabefore']]
y = data['cgpa']

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize the Ridge Regression model
ridge_model = Ridge()

# Train the model
ridge_model.fit(X_train, y_train)

# Predict and evaluate
y_train_pred = ridge_model.predict(X_train)  # Predictions on training data
y_test_pred = ridge_model.predict(X_test)  # Predictions on testing data

# Calculate performance metrics
ridge_rmse = np.sqrt(mean_squared_error(y_test, y_test_pred))  # RMSE for testing data
ridge_r2 = r2_score(y_test, y_test_pred)  # R² Score for testing data

example_input = {
    'gender': [1],  # Example encoded value for gender
    'parental_level_of_education': [2],  # Example encoded value
    'lunch': [1],  # Example encoded value
    'test_preparation': [1],  # Example encoded value
    'hours_req_per_unit': [5],  # Example hours required to complete a unit
    'hours_of_sleep': [5],  # Example hours of sleep
    'lastsemcgpa': [90],  # Example last semester CGPA
    'cgpabefore': [88]  # Example previous CGPA
}

# Create DataFrame from input data
input_df = pd.DataFrame(example_input)

# Predict CGPA using Ridge Regression
predicted_cgpa = ridge_model.predict(input_df)

# Display the predicted CGPA
print("Predicted CGPA based on input:", predicted_cgpa[0])


Predicted CGPA based on input: 65.97320298561789
