In [1]:
import joblib
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import GridSearchCV, train_test_split

In [2]:
# Step 1: Data Preprocessing
# Read the raw student-feedback survey into a DataFrame.
# NOTE(review): this is an absolute, machine-specific path; point DATA_PATH
# at your own copy of the CSV before running.
DATA_PATH = r"C:\Users\Shiva\OneDrive\Desktop\Major\studentfeedback.csv"
df = pd.read_csv(DATA_PATH)


In [3]:
# Share of missing values in each column, as a percentage of the row count.
percentage_of_na = df.isna().sum() / len(df) * 100
# Turn the Series into a two-column table: column name and its percentage missing.
nan_df = (
    percentage_of_na
    .rename_axis('Column')
    .reset_index(name='Percentage NaN')
)
print(nan_df)

                                               Column  Percentage NaN
0                                          Student ID          0.0000
1                        Well versed with the subject          0.0000
2          Explains concepts in an understandable way          0.0000
3                                Use of presentations          0.0000
4                 Degree of difficulty of assignments          0.0000
5                             Solves doubts willingly          0.0000
6                           Structuring of the course          0.0000
7   Provides support for students going above and ...          0.0000
8            Course recommendation based on relevance          0.0000
9                                      Overall rating          0.0000
10                                        Unnamed: 10          0.0000
11                                        Unnamed: 11        100.0000
12                                        Unnamed: 12        100.0000
13                  

In [4]:
# Flag every column whose share of missing values exceeds 70%.
mostly_missing = nan_df['Percentage NaN'] > 70
columns_to_drop = nan_df.loc[mostly_missing, 'Column'].tolist()

In [5]:
# Show which columns were flagged for removal.
print(columns_to_drop)

['Unnamed: 11', 'Unnamed: 12', 'Unnamed: 13', 'Unnamed: 14', 'Unnamed: 15', 'Unnamed: 16', 'Unnamed: 17', 'Unnamed: 18']


In [6]:
# Remove the mostly-empty columns listed in columns_to_drop; rows are left untouched.
# drop() is called without inplace, so the raw `df` is not modified.
df_cleaned = df.drop(columns=columns_to_drop)

In [7]:
# Inspect the frame after the mostly-empty columns have been dropped.
df_cleaned

Unnamed: 0,Student ID,Well versed with the subject,Explains concepts in an understandable way,Use of presentations,Degree of difficulty of assignments,Solves doubts willingly,Structuring of the course,Provides support for students going above and beyond,Course recommendation based on relevance,Overall rating,Unnamed: 10
0,0,7,4,8,2,1,4,8,3,4.625,0
1,1,9,6,8,7,6,5,3,10,6.750,average
2,2,10,2,7,3,10,4,4,7,5.875,weak
3,3,9,6,5,9,10,5,5,4,6.625,average
4,4,5,8,5,7,10,7,3,1,5.750,weak
...,...,...,...,...,...,...,...,...,...,...,...
996,996,6,5,6,10,8,7,8,4,6.750,average
997,997,9,8,8,6,8,3,3,10,6.875,average
998,998,9,7,5,1,2,2,1,10,4.625,weak
999,999,6,7,8,8,9,9,9,5,7.625,high


In [8]:
# Create a binary target variable (1 for "good," 0 for "needs improvement").
# NOTE(review): in the rows displayed in this notebook, 'Overall rating' equals
# the mean of the eight question scores that are later used as features, so the
# label would be a deterministic function of the inputs. Confirm this is intended.
threshold = 5
# Vectorised comparison instead of a per-row Python lambda: ratings at or above
# the threshold become 1, everything else (including missing ratings, which
# compare as False) becomes 0.
df_cleaned['target'] = df_cleaned['Overall rating'].ge(threshold).astype('int64')

In [9]:
# Step 2: Feature Selection/Engineering
# The eight per-question survey scores are the model inputs.
feature_columns = [
    'Well versed with the subject',
    'Explains concepts in an understandable way',
    'Use of presentations',
    'Degree of difficulty of assignments',
    'Solves doubts willingly',
    'Structuring of the course',
    'Provides support for students going above and beyond',
    'Course recommendation based on relevance',
]
selected_features = df_cleaned[feature_columns]

In [10]:
# Step 3: Split Data
# Features and label, then a 20% hold-out set; the seed is pinned so the
# split is the same on every run.
X, y = selected_features, df_cleaned['target']
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [11]:
# Step 4: Choose a Machine Learning Model
# Baseline random forest with 100 trees; random_state is pinned for reproducibility.
model = RandomForestClassifier(n_estimators=100, random_state=42)

In [12]:
# Step 5: Train the Model
# Fit the baseline forest on the training split only; the test split stays unseen.
model.fit(X_train, y_train)

In [13]:
# Step 6: Evaluate the Model
# Score the baseline forest on the held-out split.
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)
# Accuracy alone is misleading here: in the recorded run the test split holds
# 170 class-1 rows against 31 class-0 rows, and 23 of those 31 are predicted
# as 1, so read the per-class recall and the confusion matrix as well.
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

Accuracy: 0.8855721393034826
              precision    recall  f1-score   support

           0       1.00      0.26      0.41        31
           1       0.88      1.00      0.94       170

    accuracy                           0.89       201
   macro avg       0.94      0.63      0.67       201
weighted avg       0.90      0.89      0.86       201

[[  8  23]
 [  0 170]]


In [14]:
# Step 7: Tune Hyperparameters (optional)
# Exhaustive grid search over 3 * 4 * 3 * 3 = 108 random-forest configurations,
# each evaluated with 5-fold cross-validation on the training split only.
# No `scoring` is passed, so the estimator's default score is used.

# Define hyperparameter grid for Random Forest
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Create a grid search model
grid_search = GridSearchCV(
    estimator=RandomForestClassifier(random_state=42),
    param_grid=param_grid,
    cv=5,
)
grid_search.fit(X_train, y_train)

# Get the best hyperparameters
best_params = grid_search.best_params_
print("Best Hyperparameters:", best_params)

# GridSearchCV refits the winning configuration on the whole training split by
# default (refit=True), using the same random_state=42 estimator, so reuse that
# fitted model instead of training an identical forest a second time.
best_model = grid_search.best_estimator_

Best Hyperparameters: {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 50}


In [15]:
# Step 8: Make Predictions
# NOTE: this rebinds y_pred, replacing the baseline model's predictions from Step 6.
y_pred = best_model.predict(X_test)
y_pred_proba = best_model.predict_proba(X_test)  # Per-class probabilities; not read by any later cell shown in this notebook


In [16]:
# Step 9: Interpret Results
# List each input column next to the importance the tuned forest assigned to it.
feature_importance = best_model.feature_importances_
for idx, feature in enumerate(X.columns):
    importance = feature_importance[idx]
    print(f"{feature}: {importance}")


Well versed with the subject: 0.08319125806911185
Explains concepts in an understandable way: 0.1160541698421876
Use of presentations: 0.08036941815384666
Degree of difficulty of assignments: 0.1485222424768592
Solves doubts willingly: 0.12831070034128994
Structuring of the course: 0.13780263659281966
Provides support for students going above and beyond: 0.14993904471215155
Course recommendation based on relevance: 0.1558105298117336


In [17]:
# Step 10: Deployment (if needed)
# Persist the tuned model so an API or application can load it for predictions.
# Called through the `joblib` module, matching the `joblib.load` call used to
# read the file back; the cell's value is the list of file names written.
joblib.dump(best_model, 'course_classification_model.joblib')

['course_classification_model.joblib']

In [18]:
import joblib


In [19]:
# Read the persisted model back from disk.
# NOTE(review): joblib files are pickle-based, so only load files from a trusted source.
loaded_model =joblib.load('course_classification_model.joblib')


In [20]:
# Sanity check: predict on the held-out test features with the reloaded model.
prdecision = loaded_model.predict(X_test)

In [21]:
# Show the predicted class (0 or 1) for every test row.
print(prdecision)

[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 0 1 1 1 1 1 1 1 0 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1
 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1]


In [22]:

# Inspect the cleaned frame again, now including the derived 'target' column.
df_cleaned

Unnamed: 0,Student ID,Well versed with the subject,Explains concepts in an understandable way,Use of presentations,Degree of difficulty of assignments,Solves doubts willingly,Structuring of the course,Provides support for students going above and beyond,Course recommendation based on relevance,Overall rating,Unnamed: 10,target
0,0,7,4,8,2,1,4,8,3,4.625,0,0
1,1,9,6,8,7,6,5,3,10,6.750,average,1
2,2,10,2,7,3,10,4,4,7,5.875,weak,1
3,3,9,6,5,9,10,5,5,4,6.625,average,1
4,4,5,8,5,7,10,7,3,1,5.750,weak,1
...,...,...,...,...,...,...,...,...,...,...,...,...
996,996,6,5,6,10,8,7,8,4,6.750,average,1
997,997,9,8,8,6,8,3,3,10,6.875,average,1
998,998,9,7,5,1,2,2,1,10,4.625,weak,0
999,999,6,7,8,8,9,9,9,5,7.625,high,1


In [23]:
# Take the row at position 997 as a one-row DataFrame to use as a sample input.
testing = df_cleaned.iloc[997:998]

In [24]:
# Show the sampled row.

testing

Unnamed: 0,Student ID,Well versed with the subject,Explains concepts in an understandable way,Use of presentations,Degree of difficulty of assignments,Solves doubts willingly,Structuring of the course,Provides support for students going above and beyond,Course recommendation based on relevance,Overall rating,Unnamed: 10,target
997,997,9,8,8,6,8,3,3,10,6.875,average,1


In [25]:
# Feature columns to feed the model at prediction time.
# Derived from the training matrix X rather than retyped by hand, so the names
# and their order always match what the model was fitted on.
selected = X.columns.tolist()

In [26]:
# Keep only the feature columns of the sampled row.
final_testing = testing[selected]

In [27]:
# Show the model input built from the sampled row.
final_testing

Unnamed: 0,Well versed with the subject,Explains concepts in an understandable way,Use of presentations,Degree of difficulty of assignments,Solves doubts willingly,Structuring of the course,Provides support for students going above and beyond,Course recommendation based on relevance
997,9,8,8,6,8,3,3,10


In [28]:
# Predict the class of the sampled row with the reloaded model.
test3=loaded_model.predict(final_testing)

In [29]:
# Show the predicted label array for the sampled row.
print(test3)

[1]


In [30]:
# Translate a binary model prediction into a human-readable verdict.
def fun(n):
    """Return "good" when ``n`` equals 1, otherwise "Need improvment".

    The spelling of the second message is kept exactly as emitted, since it is
    a runtime string that other code may match on.
    """
    return "good" if n == 1 else "Need improvment"

In [31]:
# Take the row at position 996 as a second one-row sample input.
testing = df_cleaned.iloc[996:997]

In [32]:
# Show the second sampled row.
testing

Unnamed: 0,Student ID,Well versed with the subject,Explains concepts in an understandable way,Use of presentations,Degree of difficulty of assignments,Solves doubts willingly,Structuring of the course,Provides support for students going above and beyond,Course recommendation based on relevance,Overall rating,Unnamed: 10,target
996,996,6,5,6,10,8,7,8,4,6.75,average,1


In [33]:
# Keep only the feature columns of the second sampled row.
final_testing = testing[selected]

In [34]:
# Show the model input built from the second sampled row.
final_testing

Unnamed: 0,Well versed with the subject,Explains concepts in an understandable way,Use of presentations,Degree of difficulty of assignments,Solves doubts willingly,Structuring of the course,Provides support for students going above and beyond,Course recommendation based on relevance
996,6,5,6,10,8,7,8,4


In [35]:
# Predict the class of the second sampled row with the reloaded model.
test4=loaded_model.predict(final_testing)

In [36]:
# Show the predicted label array for the second sampled row.
print(test4)

[1]
