# Title: Student Attitude and Behavior Analysis

## Introduction:

The "Student Attitude and Behavior Analysis" project explores the factors that influence student performance and behavior in academic settings. The analysis covers demographics, academic performance, study habits, and social behavior to understand how they affect students' attitudes and behavior toward education.

## 1. Loading the Data


In [2]:
import pandas as pd

# Read the survey CSV (path is relative to the notebook's working directory)
data = pd.read_csv(filepath_or_buffer="Student Attitude and Behavior.csv")

# Preview the first five rows to confirm the file loaded as expected
data.head(n=5)

Unnamed: 0,Certification Course,Gender,Department,Height(CM),Weight(KG),10th Mark,12th Mark,college mark,hobbies,daily studing time,prefer to study in,salary expectation,Do you like your degree?,willingness to pursue a career based on their degree,social medai & video,Travelling Time,Stress Level,Financial Status,part-time job
0,No,Male,BCA,100.0,58.0,79.0,64.0,80.0,Video Games,0 - 30 minute,Morning,40000,No,50%,1.30 - 2 hour,30 - 60 minutes,Bad,Bad,No
1,No,Female,BCA,90.0,40.0,70.0,80.0,70.0,Cinema,30 - 60 minute,Morning,15000,Yes,75%,1 - 1.30 hour,0 - 30 minutes,Bad,Bad,No
2,Yes,Male,BCA,159.0,78.0,71.0,61.0,55.0,Cinema,1 - 2 Hour,Anytime,13000,Yes,50%,More than 2 hour,30 - 60 minutes,Awful,Bad,No
3,Yes,Female,BCA,147.0,20.0,70.0,59.0,58.0,Reading books,1 - 2 Hour,Anytime,1500000,No,50%,1.30 - 2 hour,0 - 30 minutes,Bad,good,No
4,No,Male,BCA,170.0,54.0,40.0,65.0,30.0,Video Games,30 - 60 minute,Morning,50000,Yes,25%,1.30 - 2 hour,30 - 60 minutes,Good,good,No


In [4]:
# List the column labels. Note in the output that several labels end with
# trailing spaces (e.g. 'Travelling Time ', 'Stress Level '), so later cells
# must reference them with exactly that spelling.
data.columns

Index(['Certification Course', 'Gender', 'Department', 'Height(CM)',
       'Weight(KG)', '10th Mark', '12th Mark', 'college mark', 'hobbies',
       'daily studing time', 'prefer to study in', 'salary expectation',
       'Do you like your degree?',
       'willingness to pursue a career based on their degree  ',
       'social medai & video', 'Travelling Time ', 'Stress Level ',
       'Financial Status', 'part-time job'],
      dtype='object')

# 2. Preprocessing

## Handling Missing Values (Imputation):

In [5]:
from sklearn.impute import SimpleImputer

# Numeric columns: missing entries are replaced by the column mean.
numerical_cols = [
    'Height(CM)', 'Weight(KG)', '10th Mark', '12th Mark', 'college mark',
    'salary expectation',
]

# Categorical columns: missing entries are replaced by the most frequent value.
# Some labels end in spaces, exactly as they appear in the column index.
categorical_cols = [
    'Certification Course', 'Gender', 'Department', 'hobbies',
    'daily studing time', 'prefer to study in', 'Do you like your degree?',
    'willingness to pursue a career based on their degree  ',
    'social medai & video', 'Travelling Time ', 'Stress Level ',
    'Financial Status', 'part-time job',
]

# Impute each column group with its own strategy, writing back into `data`.
for cols, strategy in ((numerical_cols, 'mean'), (categorical_cols, 'most_frequent')):
    imputer = SimpleImputer(strategy=strategy)
    data[cols] = imputer.fit_transform(data[cols])

data.head()


Unnamed: 0,Certification Course,Gender,Department,Height(CM),Weight(KG),10th Mark,12th Mark,college mark,hobbies,daily studing time,prefer to study in,salary expectation,Do you like your degree?,willingness to pursue a career based on their degree,social medai & video,Travelling Time,Stress Level,Financial Status,part-time job
0,No,Male,BCA,100.0,58.0,79.0,64.0,80.0,Video Games,0 - 30 minute,Morning,40000.0,No,50%,1.30 - 2 hour,30 - 60 minutes,Bad,Bad,No
1,No,Female,BCA,90.0,40.0,70.0,80.0,70.0,Cinema,30 - 60 minute,Morning,15000.0,Yes,75%,1 - 1.30 hour,0 - 30 minutes,Bad,Bad,No
2,Yes,Male,BCA,159.0,78.0,71.0,61.0,55.0,Cinema,1 - 2 Hour,Anytime,13000.0,Yes,50%,More than 2 hour,30 - 60 minutes,Awful,Bad,No
3,Yes,Female,BCA,147.0,20.0,70.0,59.0,58.0,Reading books,1 - 2 Hour,Anytime,1500000.0,No,50%,1.30 - 2 hour,0 - 30 minutes,Bad,good,No
4,No,Male,BCA,170.0,54.0,40.0,65.0,30.0,Video Games,30 - 60 minute,Morning,50000.0,Yes,25%,1.30 - 2 hour,30 - 60 minutes,Good,good,No


## One-Hot Encoding

In [27]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder, LabelEncoder


# Re-read the raw CSV. This rebinds `data`, so the imputation applied in the
# earlier preprocessing cell is not carried into the encoded output below.
data = pd.read_csv("Student Attitude and Behavior.csv")

# Labels must match the CSV header exactly, including the trailing spaces.
categorical_cols = ['Certification Course', 'Gender', 'Department', 'hobbies', 'daily studing time',
                    'prefer to study in', 'Do you like your degree?',
                    'willingness to pursue a career based on their degree  ',
                    'social medai & video', 'Travelling Time ', 'Stress Level ', 'Financial Status', 'part-time job']

# The `sparse=False` keyword was renamed to `sparse_output` in scikit-learn 1.2
# and removed in 1.4. To stay compatible with both old and new releases, keep
# the encoder's default output and densify the result explicitly instead.
one_hot_encoder = OneHotEncoder(handle_unknown='ignore')

one_hot_encoded_data = one_hot_encoder.fit_transform(data[categorical_cols])
if hasattr(one_hot_encoded_data, "toarray"):
    # Default output is a SciPy sparse matrix; convert it to a dense ndarray.
    one_hot_encoded_data = one_hot_encoded_data.toarray()

# One indicator column per (feature, category) pair, named "<feature>_<category>".
one_hot_encoded_df = pd.DataFrame(one_hot_encoded_data, columns=one_hot_encoder.get_feature_names_out(categorical_cols))

print("One-hot encoded data:")
print(one_hot_encoded_df.head())


One-hot encoded data:
   Certification Course_No  Certification Course_Yes  Gender_Female  \
0                      1.0                       0.0            0.0   
1                      1.0                       0.0            1.0   
2                      0.0                       1.0            0.0   
3                      0.0                       1.0            1.0   
4                      1.0                       0.0            0.0   

   Gender_Male  Department_B.com Accounting and Finance   \
0          1.0                                       0.0   
1          0.0                                       0.0   
2          1.0                                       0.0   
3          0.0                                       0.0   
4          1.0                                       0.0   

   Department_B.com ISM  Department_BCA  Department_Commerce  hobbies_Cinema  \
0                   0.0             1.0                  0.0             0.0   
1                   0.0       

## Label Encoding

In [7]:


# One encoder instance is refit on each column in turn, so once this cell has
# run it only retains the class mapping of the last column processed.
label_encoder = LabelEncoder()

# Replace every categorical column by integer codes, column by column
label_encoded_data = data[categorical_cols].apply(
    lambda column: label_encoder.fit_transform(column)
)

# Show the first few encoded rows
encoded_preview = label_encoded_data.head()
print("\nLabel encoded data:")
print(encoded_preview)



Label encoded data:
   Certification Course  Gender  Department  hobbies  daily studing time  \
0                     0       1           2        3                   0   
1                     0       0           2        0                   4   
2                     1       1           2        0                   1   
3                     1       0           2        1                   1   
4                     0       1           2        3                   4   

   prefer to study in  Do you like your degree?  \
0                   1                         0   
1                   1                         1   
2                   0                         1   
3                   0                         0   
4                   1                         1   

   willingness to pursue a career based on their degree    \
0                                                  3        
1                                                  4        
2                               

## Scaling

### Standardization

In [9]:
import pandas as pd
from sklearn.preprocessing import StandardScaler, MinMaxScaler

# Start again from the raw CSV contents.
data = pd.read_csv("Student Attitude and Behavior.csv")

# Numeric columns to rescale.
numerical_cols = [
    'Height(CM)', 'Weight(KG)', '10th Mark', '12th Mark', 'college mark',
    'salary expectation',
]

# Standardize: learn per-column statistics, then apply them to the same rows.
scaler_standard = StandardScaler()
numeric_frame = data[numerical_cols]
standardized_data = scaler_standard.fit(numeric_frame).transform(numeric_frame)

# Wrap the resulting array back into a labelled DataFrame for display.
standardized_df = pd.DataFrame(data=standardized_data, columns=numerical_cols)
print(standardized_df.head())




   Height(CM)  Weight(KG)  10th Mark  12th Mark  college mark  \
0   -2.674221   -0.188631   0.165248  -0.434379      0.595099   
1   -3.140096   -1.399601  -0.526009   1.020865     -0.042090   
2    0.074441    1.156891  -0.449202  -0.707237     -0.997872   
3   -0.484609   -2.745123  -0.526009  -0.889142     -0.806716   
4    0.586903   -0.457735  -2.830197  -0.343426     -2.590843   

   salary expectation  
0            0.067685  
1           -0.157383  
2           -0.175388  
3           13.211657  
4            0.157713  


### MIN-MAX Scaling

In [10]:
# Min-max scaling of the same numeric columns (scaler left at its defaults).
scaler_minmax = MinMaxScaler()
numeric_values = data[numerical_cols]
minmax_scaled_data = scaler_minmax.fit_transform(numeric_values)

# Restore the column labels and preview the scaled values.
minmax_scaled_df = pd.DataFrame(data=minmax_scaled_data, columns=numerical_cols)
print(minmax_scaled_df.head())

   Height(CM)  Weight(KG)  10th Mark  12th Mark  college mark  \
0    0.509333    0.441860   0.790287   0.387755      0.797980   
1    0.456000    0.232558   0.690949   0.714286      0.696970   
2    0.824000    0.674419   0.701987   0.326531      0.545455   
3    0.760000    0.000000   0.690949   0.285714      0.575758   
4    0.882667    0.395349   0.359823   0.408163      0.292929   

   salary expectation  
0            0.026667  
1            0.010000  
2            0.008667  
3            1.000000  
4            0.033333  


## Splitting

In [11]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Load the dataset
data = pd.read_csv("Student Attitude and Behavior.csv")

# Separate the regression target from the predictor columns
target_column = 'salary expectation'
y = data[target_column]                  # Target variable
X = data.drop(columns=[target_column])   # Features

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Report the resulting shapes as (rows, columns) for X and (rows,) for y
print("Training set shape:", X_train.shape, y_train.shape)
print("Testing set shape:", X_test.shape, y_test.shape)


Training set shape: (188, 18) (188,)
Testing set shape: (47, 18) (47,)


# 3. Training the ML Models

## Linear Regression

In [14]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error


# Start from the raw CSV so the model cells do not depend on earlier mutations of `data`.
data = pd.read_csv("Student Attitude and Behavior.csv")

# Predict 'salary expectation' from every other column.
X = data.drop(columns=['salary expectation'])
y = data['salary expectation']

# 80/20 split with a fixed seed. (The original call was broken across two lines
# in the middle of the function name, which is a syntax error.)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Text columns are one-hot encoded; categories not seen during fit are ignored
# at prediction time instead of raising.
categorical_cols = X.select_dtypes(include=['object']).columns
categorical_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# 'number' selects every numeric dtype; the previous ['int', 'float'] selector
# depends on the platform's default integer width and can miss int64 columns.
numerical_cols = X.select_dtypes(include='number').columns

# Shared preprocessing step reused by all model pipelines below:
# numeric columns pass through unchanged, categorical columns are one-hot encoded.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', 'passthrough', numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ])

# Linear Regression
print("Linear Regression:")
lr_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression())
])

lr_pipeline.fit(X_train, y_train)

y_pred_lr = lr_pipeline.predict(X_test)

# Calculate Mean Squared Error on the held-out test set
mse_lr = mean_squared_error(y_test, y_pred_lr)
print("Mean Squared Error:", mse_lr)


Linear Regression:
Mean Squared Error: 4345966265.676295


## Decision Tree Regressor

In [17]:



# DecisionTreeRegressor was used without being imported anywhere in the notebook,
# which raises NameError on a fresh kernel.
from sklearn.tree import DecisionTreeRegressor

print("Decision Tree Regressor:")
# Same preprocessing as the linear model, followed by a seeded decision tree.
dt_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', DecisionTreeRegressor(random_state=42))
])

dt_pipeline.fit(X_train, y_train)

y_pred_dt = dt_pipeline.predict(X_test)

# Mean Squared Error on the held-out test set
mse_dt = mean_squared_error(y_test, y_pred_dt)
print("Mean Squared Error:", mse_dt)


Decision Tree Regressor:
Mean Squared Error: 346062787.63829786


## SVR

In [None]:
# SVR was used without being imported anywhere in the notebook, which raises
# NameError on a fresh kernel.
from sklearn.svm import SVR

# Shared preprocessing followed by an SVR left at its default hyperparameters.
svr_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', SVR())
])

svr_pipeline.fit(X_train, y_train)

y_pred_svr = svr_pipeline.predict(X_test)

# Calculate Mean Squared Error on the held-out test set
mse_svr = mean_squared_error(y_test, y_pred_svr)
print("Mean Squared Error:", mse_svr)

## K-Nearest Neighbors (KNN)


In [19]:
 # KNeighborsRegressor was used without being imported anywhere in the notebook,
 # which raises NameError on a fresh kernel.
 from sklearn.neighbors import KNeighborsRegressor

 print("\nK-Nearest Neighbors (KNN):")
 # Shared preprocessing followed by a KNN regressor left at its defaults.
 knn_pipeline = Pipeline(steps=[
     ('preprocessor', preprocessor),
     ('regressor', KNeighborsRegressor())
 ])

 knn_pipeline.fit(X_train, y_train)

 y_pred_knn = knn_pipeline.predict(X_test)

 # Calculate Mean Squared Error on the held-out test set
 mse_knn = mean_squared_error(y_test, y_pred_knn)
 print("Mean Squared Error:", mse_knn)


K-Nearest Neighbors (KNN):
Mean Squared Error: 1031663550.6417022


# 4. Hyperparameter Optimization

## Support Vector Regression (SVR) with GridSearchCV

In [28]:
from sklearn.model_selection import GridSearchCV

# SVR is not imported by any import line in the notebook; import it here so
# this cell runs on a fresh kernel.
from sklearn.svm import SVR

# Pipeline to be tuned: shared preprocessing followed by an SVR whose
# hyperparameters are addressed with the 'regressor__' prefix.
svr_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', SVR())
])

In [21]:
# Search space for the SVR step; each key is '<pipeline step>__<parameter>'.
param_grid_svr = dict(
    regressor__C=[0.1, 1, 10, 100],
    regressor__epsilon=[0.01, 0.1, 0.5, 1],
    regressor__kernel=['linear', 'rbf', 'poly'],
)



In [22]:
# Exhaustive 5-fold search over the SVR grid. The scorer is negated MSE because
# scikit-learn maximizes scores; the fitted search object is the cell output.
grid_search_svr = GridSearchCV(
    estimator=svr_pipeline,
    param_grid=param_grid_svr,
    scoring='neg_mean_squared_error',
    cv=5,
)
grid_search_svr.fit(X_train, y_train)

GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('preprocessor',
                                        ColumnTransformer(transformers=[('num',
                                                                         'passthrough',
                                                                         Index(['Height(CM)', 'Weight(KG)', '10th Mark', '12th Mark', 'college mark'], dtype='object')),
                                                                        ('cat',
                                                                         Pipeline(steps=[('onehot',
                                                                                          OneHotEncoder(handle_unknown='ignore'))]),
                                                                         Index(['Certification Course', 'Gender', 'Department', 'hobbies',
       'daily studing t...u like your degree?',
       'willingness to pursue a career based on their degree  ',
       'social medai & 

In [23]:
# Best configuration found, and its cross-validated MSE
# (sign flipped back, since the search scored negated MSE).
svr_best_params = grid_search_svr.best_params_
svr_best_mse = -grid_search_svr.best_score_
print("Best parameters for SVR:", svr_best_params)
print("Best score for SVR:", svr_best_mse)

Best parameters for SVR: {'regressor__C': 0.1, 'regressor__epsilon': 0.01, 'regressor__kernel': 'rbf'}
Best score for SVR: 15506133756.023489


## Random Forest Regressor with RandomizedSearchCV:


In [24]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint

# RandomForestRegressor was used without being imported anywhere in the notebook,
# which raises NameError on a fresh kernel.
from sklearn.ensemble import RandomForestRegressor

# Shared preprocessing followed by a random forest. The forest is seeded so the
# search is repeatable, matching the other seeded estimators in this notebook.
rf_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor(random_state=42))
])

# Search space sampled by RandomizedSearchCV; keys are '<pipeline step>__<parameter>'.
param_dist_rf = {
    'regressor__n_estimators': randint(100, 1000),
    # 'auto' was removed from max_features in scikit-learn 1.3; for regressors it
    # meant "use all features", which is spelled 1.0 in every version.
    'regressor__max_features': [1.0, 'sqrt'],
    'regressor__max_depth': [None, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100],
    'regressor__min_samples_split': [2, 5, 10],
    'regressor__min_samples_leaf': [1, 2, 4],
    'regressor__bootstrap': [True, False]
}

In [25]:
# Randomized 5-fold search: 100 sampled configurations, scored by negated MSE,
# with a fixed seed for the sampling. The fitted search object is the cell output.
random_search_rf = RandomizedSearchCV(
    estimator=rf_pipeline,
    param_distributions=param_dist_rf,
    n_iter=100,
    scoring='neg_mean_squared_error',
    cv=5,
    random_state=42,
)
random_search_rf.fit(X_train, y_train)

RandomizedSearchCV(cv=5,
                   estimator=Pipeline(steps=[('preprocessor',
                                              ColumnTransformer(transformers=[('num',
                                                                               'passthrough',
                                                                               Index(['Height(CM)', 'Weight(KG)', '10th Mark', '12th Mark', 'college mark'], dtype='object')),
                                                                              ('cat',
                                                                               Pipeline(steps=[('onehot',
                                                                                                OneHotEncoder(handle_unknown='ignore'))]),
                                                                               Index(['Certification Course', 'Gender', 'Department', 'hobbies',
       'daily st...
                   param_distributions={'regressor__bootstrap'

In [26]:
# Best configuration found, and its cross-validated MSE
# (sign flipped back, since the search scored negated MSE).
rf_best_params = random_search_rf.best_params_
rf_best_mse = -random_search_rf.best_score_
print("Best parameters for Random Forest:", rf_best_params)
print("Best score for Random Forest:", rf_best_mse)

Best parameters for Random Forest: {'regressor__bootstrap': True, 'regressor__max_depth': 30, 'regressor__max_features': 'sqrt', 'regressor__min_samples_leaf': 4, 'regressor__min_samples_split': 10, 'regressor__n_estimators': 500}
Best score for Random Forest: 15451359060.431616
