In [1]:
import os
import optuna
import sklearn
import warnings
import numpy as np
import pandas as pd
from tqdm import TqdmWarning
warnings.filterwarnings("ignore",category=TqdmWarning)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the train and test splits from CSV files in the working directory.
df_train = pd.read_csv("train.csv")
df_test = pd.read_csv("test.csv")

In [3]:
# Print the column index, then show the training frame as the cell's rich output.
print(df_train.columns)
df_train

Index(['id', 'age', 'gender', 'course', 'study_hours', 'class_attendance',
       'internet_access', 'sleep_hours', 'sleep_quality', 'study_method',
       'facility_rating', 'exam_difficulty', 'exam_score'],
      dtype='object')


Unnamed: 0,id,age,gender,course,study_hours,class_attendance,internet_access,sleep_hours,sleep_quality,study_method,facility_rating,exam_difficulty,exam_score
0,0,21,female,b.sc,7.91,98.8,no,4.9,average,online videos,low,easy,78.300
1,1,18,other,diploma,4.95,94.8,yes,4.7,poor,self-study,medium,moderate,46.700
2,2,20,female,b.sc,4.68,92.6,yes,5.8,poor,coaching,high,moderate,99.000
3,3,19,male,b.sc,2.00,49.5,yes,8.3,average,group study,high,moderate,63.900
4,4,23,male,bca,7.65,86.9,yes,9.6,good,self-study,high,easy,100.000
...,...,...,...,...,...,...,...,...,...,...,...,...,...
629995,629995,18,female,b.tech,4.86,70.7,yes,4.1,good,mixed,high,moderate,69.500
629996,629996,21,female,ba,7.08,54.4,yes,4.5,average,mixed,low,moderate,78.900
629997,629997,24,male,bca,0.64,44.2,yes,4.3,poor,online videos,low,moderate,19.599
629998,629998,20,male,b.com,1.54,75.1,yes,8.2,average,group study,high,moderate,59.100


In [4]:
# Text-valued columns that get replaced by integer codes ("encode_<name>").
col_encode = [
    'gender',
    'course',
    'internet_access',
    'sleep_quality',
    'study_method',
    'facility_rating',
    'exam_difficulty',
]
# Numeric columns that get standardized ("scale_<name>").
col_scale = [
    'class_attendance',
    'sleep_hours',
]

In [5]:
from sklearn.preprocessing import StandardScaler, LabelEncoder

def train_encode_scale(df, col_encode, col_scale):
    """Fit one encoder/scaler per column on the training frame and apply it.

    The caller's DataFrame is left untouched; all work happens on a copy.

    Parameters
    ----------
    df : pandas.DataFrame
        Training data. Must contain every column named in ``col_encode`` and
        ``col_scale`` plus an ``'id'`` column.
    col_encode : list of str
        Columns to run through a fresh ``LabelEncoder``; each is replaced by
        ``'encode_<col>'``.
    col_scale : list of str
        Columns to run through a fresh ``StandardScaler``; each is replaced by
        ``'scale_<col>'``.

    Returns
    -------
    (pandas.DataFrame, dict, dict)
        The transformed frame (source columns and ``'id'`` dropped, index
        reset), the fitted encoders keyed by source column name, and the
        fitted scalers keyed by source column name.
    """
    # Copy first: assigning a new column would otherwise write into the
    # caller's frame (e.g. leave a stray 'encode_gender' in df_train).
    df = df.copy()
    encoders = {}
    scalers = {}
    for col in col_encode:
        encoder = LabelEncoder()
        df['encode_' + col] = encoder.fit_transform(df[col])
        encoders[col] = encoder
    for col in col_scale:
        scaler = StandardScaler()
        # The scaler expects a 2-D input, hence the double brackets.
        df['scale_' + col] = scaler.fit_transform(df[[col]])
        scalers[col] = scaler
    # Drop all source columns and the id in one pass; the derived columns
    # keep the order in which they were appended.
    df = df.drop(columns=[*col_encode, *col_scale, 'id']).reset_index(drop=True)
    return df, encoders, scalers
def test_process(df_test,col_encode,col_scale,encoders,scalers):
    for col in col_encode:
        encoder = encoders[col]
        df_test['encode_'+col] = encoder.transform(df_test[col])
        df_test = df_test.drop(columns = [col])
    for col in col_scale:
        scaler = scalers[col]
        df_test['scale_'+col] = scaler.transform(df_test[[col]])
        df_test = df_test.drop(columns = [col])
    df_test = df_test.drop(columns = ['id']).reset_index(drop = True)
    return df_test

In [6]:
# Fit the encoders/scalers on the training frame, then reuse the fitted
# objects to transform the test frame.
data, encoders, scalers = train_encode_scale(df_train,col_encode,col_scale)
test_data = test_process(df_test, col_encode, col_scale, encoders, scalers)
# Split the processed training frame into features (x) and target (y).
x = data.drop(columns= ['exam_score'])
y = data['exam_score']

# RandomForest

In [7]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
# Hold out 30% of the rows for validation; the split is seeded.
train_x, test_x, train_y, test_y = train_test_split(
    x,
    y,
    test_size=0.3,
    shuffle=True,
    random_state=42,
)

# Build the forest and fit it in one expression (fit returns the estimator).
rfr_model = RandomForestRegressor(
    n_estimators=250,
    max_depth=30,
    min_samples_split=10,
    min_samples_leaf=10,
    random_state=42,
    n_jobs=-1,
).fit(train_x, train_y)

# Score the model on the held-out rows.
pred = rfr_model.predict(test_x)
mse = mean_squared_error(test_y, pred)
print("Mean Squared Error (MSE):", mse)

Mean Squared Error (MSE): 79.70771413078828


In [8]:
# Print the feature names seen during fit and the matching importances;
# the two arrays are positionally aligned.
print(rfr_model.feature_names_in_)
print(rfr_model.feature_importances_)

['age' 'study_hours' 'encode_gender' 'encode_course'
 'encode_internet_access' 'encode_sleep_quality' 'encode_study_method'
 'encode_facility_rating' 'encode_exam_difficulty'
 'scale_class_attendance' 'scale_sleep_hours']
[8.08210755e-03 7.04590229e-01 4.00360895e-03 7.14172479e-03
 2.72855032e-04 4.44390144e-02 3.98070684e-02 2.90424412e-02
 3.77928094e-03 1.25596627e-01 3.32450421e-02]


In [11]:
# Predict on the processed test features and write the submission file.
result = rfr_model.predict(test_data)
# Ids come from the raw test frame, because the processed frame has no 'id'.
answer = pd.DataFrame({'id':df_test['id'],'exam_score':result})
answer.to_csv('submission.csv', index=False)

###### Experiment 1

嘗試將 categorical data 轉換為 numerical data（整數編碼）後，再進行訓練。

In [1]:
import os
import optuna
import sklearn
import warnings
import numpy as np
import pandas as pd
from tqdm import TqdmWarning
warnings.filterwarnings("ignore",category=TqdmWarning)
# Reload the raw train and test splits for this experiment.
df_test = pd.read_csv("test.csv")
df_train = pd.read_csv("train.csv")
# Columns treated as categorical in this experiment.
categorical_data = ["gender", 
                    "course", 
                    "internet_access", 
                    "sleep_quality", 
                    "study_method", 
                    "facility_rating", 
                    "exam_difficulty" ]
# Pick out the labels in the categorical data that hold text values.
# Work on a copy so df_train keeps the raw values.
df_copy = df_train.copy()

  from .autonotebook import tqdm as notebook_tqdm


In [7]:
# Print the name of every column whose dtype pandas classifies as string.
for label, data in df_copy.items():
    if pd.api.types.is_string_dtype(data):
        print(label)

gender
course
internet_access
sleep_quality
study_method
facility_rating
exam_difficulty


In [10]:
# Overwrite each categorical column with its 1-based integer category code.
for label, data in df_copy.items():
    if label not in categorical_data:
        continue  # leave non-categorical columns as they are
    category_codes = pd.Categorical(data).codes
    df_copy[label] = category_codes + 1

In [11]:
df_copy.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 630000 entries, 0 to 629999
Data columns (total 13 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   id                630000 non-null  int64  
 1   age               630000 non-null  int64  
 2   gender            630000 non-null  int8   
 3   course            630000 non-null  int8   
 4   study_hours       630000 non-null  float64
 5   class_attendance  630000 non-null  float64
 6   internet_access   630000 non-null  int8   
 7   sleep_hours       630000 non-null  float64
 8   sleep_quality     630000 non-null  int8   
 9   study_method      630000 non-null  int8   
 10  facility_rating   630000 non-null  int8   
 11  exam_difficulty   630000 non-null  int8   
 12  exam_score        630000 non-null  float64
dtypes: float64(4), int64(2), int8(7)
memory usage: 33.0 MB


In [12]:
# Split the encoded frame into features (x) and target (y).
# NOTE(review): only 'exam_score' is dropped, so 'id' remains a feature here;
# confirm that is intended.
x = df_copy.drop(columns= ['exam_score'])
y = df_copy['exam_score']

In [20]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Lasso, ElasticNet
from sklearn.metrics import mean_squared_error
# 1. Hold out 20% of the rows for validation. The split is seeded so the
#    reported MSE values are reproducible across kernel restarts.
train_x, test_x, train_y, test_y = train_test_split(x, y, test_size= 0.2, shuffle = True, random_state = 42)
# 2. Fit a Lasso regression model
lasso_model = Lasso(alpha=0.1, random_state=42)  # alpha controls the regularization strength
lasso_model.fit(train_x, train_y)  # train the model
lasso_predictions = lasso_model.predict(test_x)  # predict on the held-out rows
lasso_mse = mean_squared_error(test_y, lasso_predictions)  # compute the MSE

print(f"Lasso MSE: {lasso_mse}")

# 3. Fit an ElasticNet regression model
elasticnet_model = ElasticNet(alpha=0.1, l1_ratio=0.5, random_state=42)
# alpha: regularization strength; l1_ratio: mix between the L1 and L2 penalties
elasticnet_model.fit(train_x, train_y)  # train the model
elasticnet_predictions = elasticnet_model.predict(test_x)  # predict on the held-out rows
elasticnet_mse = mean_squared_error(test_y, elasticnet_predictions)  # compute the MSE

print(f"ElasticNet MSE: {elasticnet_mse}")

Lasso MSE: 98.38581981460109
ElasticNet MSE: 98.44457287741547


In [39]:
# Encode the test categoricals with the SAME label -> code mapping that the
# training frame received. pd.Categorical derives its categories from the
# values it is given, so encoding the test frame on its own could assign
# different codes to the same label whenever the two value sets differ.
# Labels not seen in training (and missing values) get code 0.
# The encoded copy keeps df_test itself raw, so re-running this cell is safe.
test_encoded = df_test.copy()
for label in categorical_data:
    train_categories = pd.Categorical(df_train[label]).categories
    test_encoded[label] = pd.Categorical(df_test[label], categories=train_categories).codes + 1
result = lasso_model.predict(test_encoded)
answer = pd.DataFrame({'id':df_test['id'],'exam_score':result})
answer.to_csv('submission.csv', index=False)