In [None]:
import warnings
from collections import Counter

# Installed before the third-party imports so import-time warnings are silenced too.
warnings.filterwarnings("ignore")

import geopandas as gpd
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from imblearn.over_sampling import SMOTE
from scipy import signal
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor
from sklearn.feature_selection import VarianceThreshold
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.tree import DecisionTreeRegressor
from xgboost import XGBRegressor

In [None]:
# Load the source CSV from the working directory into `df`.
df = pd.read_csv('student-por.csv')

In [None]:
# Bare expression: the notebook renders the frame as this cell's output.
df

In [None]:
# Print the column names, dtypes and non-null counts of the frame.
df.info()

In [None]:
# One histogram per numeric column, arranged on a 4x4 grid of subplots.
numeric_columns = df.select_dtypes(include="number").columns

numeric_frame = df.loc[:, numeric_columns]
numeric_frame.hist(layout=(4, 4), figsize=(15, 10), bins=15)

plt.tight_layout()
plt.show()

In [None]:
# Start again from the file on disk so this cell can be re-run safely, then
# one-hot encode the non-numeric columns (the first level of each is dropped).
df = pd.read_csv('student-por.csv').pipe(pd.get_dummies, drop_first=True)

df.head()

In [None]:
# Count, per numeric column, the values lying more than `threshold`
# standard deviations away from that column's mean.
threshold = 3

numeric_cols = df.select_dtypes(include=[np.number])
z_scores = numeric_cols.sub(numeric_cols.mean()).div(numeric_cols.std())
outliers = z_scores.abs().gt(threshold)

print("Кількість аномалій у кожному числовому стовпці:")
print(outliers.sum())

In [None]:
# Separate the features from the target column G3.
X = df.drop(columns='G3')
y = df['G3']
print('Original dataset shape %s' % Counter(y))

# Rows with these G3 values are removed before oversampling.
# NOTE(review): presumably these grades are too rare for SMOTE's
# nearest-neighbour step -- confirm against the Counter printed above.
keep_rows = ~y.isin([1, 5, 19, 6])
X_filtered = X[keep_rows]
y_filtered = y[keep_rows]
print(Counter(y_filtered))

# Oversample with a fixed seed; each distinct G3 value is treated as a class.
# NOTE(review): the resampling runs before the train/test split further down,
# so synthetic rows can end up in the test set -- confirm this is intended.
smote = SMOTE(sampling_strategy='auto', random_state=42)
X_res, y_res = smote.fit_resample(X_filtered, y_filtered)
print('Resampled dataset shape %s' % Counter(y_res))

# Re-attach the target and save the extended dataset for the following cells.
resampled_df = pd.DataFrame(X_res, columns=X.columns).assign(G3=y_res)
resampled_df.to_csv('student-por-extended.csv', index=False)

In [None]:
# Continue with the oversampled dataset written by the resampling cell.
df = pd.read_csv('student-por-extended.csv')

# Bare expression: the notebook renders the frame as this cell's output.
df

In [None]:
# Distribution of the target column G3 after resampling.
fig, ax = plt.subplots(figsize=(8, 5))
ax.hist(df['G3'], bins=15, alpha=0.7, edgecolor='k')
ax.set_title('Розподіл цільової змінної G3', fontsize=14)
ax.set_xlabel('Значення G3', fontsize=12)
ax.set_ylabel('Кількість', fontsize=12)
ax.grid(axis='y', linestyle='--', alpha=0.7)
plt.show()

In [None]:
# One histogram per numeric column of the extended dataset.
numeric_columns = df.select_dtypes(include=[np.number]).columns

# Size the subplot grid from the actual number of numeric columns. A fixed
# 4x4 layout makes `DataFrame.hist` raise ValueError as soon as there are more
# than 16 of them, which can happen here if the one-hot columns load as
# integers. With exactly 16 columns this reproduces the 4x4 grid and the
# (15, 10) figure size used before.
n_hist_cols = 4
n_hist_rows = max(1, int(np.ceil(len(numeric_columns) / n_hist_cols)))

df[numeric_columns].hist(
    bins=15,
    figsize=(15, 2.5 * n_hist_rows),
    layout=(n_hist_rows, n_hist_cols),
)
plt.tight_layout()
plt.show()

In [None]:
# Repeat the z-score outlier count on the resampled data.
numeric_cols = df.select_dtypes(include=[np.number])

# Standardise every numeric column with its own mean and standard deviation.
column_means = numeric_cols.mean()
column_stds = numeric_cols.std()
z_scores = (numeric_cols - column_means) / column_stds

# A value counts as an anomaly when |z| exceeds this cut-off.
threshold = 3
outliers = z_scores.abs() > threshold

print("Кількість аномалій у кожному числовому стовпці:")
print(outliers.sum())

In [None]:
# Pairwise correlations between all columns, drawn as an annotated heatmap.
correlation_matrix = df.corr()

fig, ax = plt.subplots(figsize=(22, 16))
sns.heatmap(
    correlation_matrix,
    annot=True,
    fmt='.2f',
    cmap='coolwarm',
    linewidths=0.5,
    ax=ax,
)
ax.set_title("Кореляційна матриця")
plt.show()

In [None]:
# Rank columns by the sum of their absolute correlations with every column
# (the column itself included) and keep the ten highest.
correlation_sums = correlation_matrix.abs().sum(axis="columns")

ranked_sums = correlation_sums.sort_values(ascending=False)
top_features = ranked_sums.index[:10]

print("Топ характеристики за кореляцією:", top_features)

In [None]:
# Hand-picked columns to keep for modelling; the target G3 is among them.
selected_features = [
    'G1',
    'G2',
    'G3',
    'Medu',
    'higher_yes',
    'Fedu',
    'failures',
    'studytime',
    'internet_yes',
]
df_selected = df.loc[:, selected_features]

# Save the reduced dataset so later cells can reload it from disk.
df_selected.to_csv('selected_features_dataset.csv', index=False)

In [None]:
# Reload the reduced dataset written by the feature-selection cell.
df = pd.read_csv('selected_features_dataset.csv')

# Print the column names, dtypes and non-null counts of the reduced frame.
df.info()

In [None]:
# Show the first rows of the reduced dataset.
df.head()

In [None]:
# Summary statistics for the columns of the reduced dataset.
df.describe()

In [None]:
# Correlation heatmap restricted to the selected columns.
correlation_matrix = df.corr()

heatmap_options = {
    'annot': True,
    'cmap': 'coolwarm',
    'fmt': '.2f',
    'linewidths': 0.5,
}

plt.figure(figsize=(12, 8))
sns.heatmap(correlation_matrix, **heatmap_options)
plt.title("Кореляційна матриця")
plt.show()

In [None]:
# Recompute the numeric columns from the frame that is loaded now.
# `numeric_columns` was last derived from the wide extended dataset, and most
# of those columns are absent from the reduced frame, so indexing with the
# stale list raises KeyError.
numeric_columns = df.select_dtypes(include=[np.number]).columns

plt.figure(figsize=(12, 8))
sns.boxplot(data=df[numeric_columns])
plt.title('Boxplot для числових змінних')
plt.xticks(rotation=90)
plt.show()

In [None]:
# Feature matrix: every column except the target G3.
X = df.drop(columns=['G3'])
# Target vector: the G3 column.
y = df['G3']

In [None]:
# Split the feature columns by dtype: object columns are one-hot encoded,
# every other column goes through the numeric pipeline.
num_features = X.select_dtypes(exclude="object").columns
cat_features = X.select_dtypes(include="object").columns

numeric_transformer = StandardScaler()
oh_transformer = OneHotEncoder()
variance_selector = VarianceThreshold(threshold=0.01)

original_columns = X.columns

# Chain the variance filter and the scaler so each numeric feature appears
# once in the output. As two parallel ColumnTransformer branches over the same
# columns they emitted every numeric feature twice (raw and scaled), and the
# variance filter never removed anything from the scaled copy.
numeric_pipeline = Pipeline(
    steps=[
        ("variance", variance_selector),
        ("StandardScaler", numeric_transformer),
    ]
)

preprocessor = ColumnTransformer(
    transformers=[
        ("numeric", numeric_pipeline, num_features),
        ("OneHotEncoder", oh_transformer, cat_features),
    ],
    remainder='passthrough'
)

# NOTE(review): the preprocessor is fitted on all rows before the train/test
# split, so the scaling statistics include the test rows -- consider fitting
# on the training split only.
X = preprocessor.fit_transform(X)
print(X.shape)

In [None]:
# Hold out 20% of the rows as the test set; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Report the shapes of the two feature splits.
print(X_train.shape, X_test.shape)

In [None]:
def evaluate_model(true, predicted):
    """Score predictions against ground truth.

    Parameters
    ----------
    true : array-like
        Observed target values.
    predicted : array-like
        Model predictions, aligned element-wise with ``true``.

    Returns
    -------
    tuple
        ``(mae, rmse, r2)``: mean absolute error, root mean squared error
        and the R^2 score, in that order.
    """
    mae = mean_absolute_error(true, predicted)
    # Compute the MSE once and take its square root for the RMSE.
    mse = mean_squared_error(true, predicted)
    rmse = np.sqrt(mse)
    r2_square = r2_score(true, predicted)
    return mae, rmse, r2_square

In [None]:
def plot_actual_vs_fitted(y_actual, y_pred, model_name):
    """Scatter predicted values against actual values for one model.

    A dashed red diagonal spanning the range of ``y_actual`` marks where
    perfect predictions would fall. ``y_actual`` must provide ``.min()`` and
    ``.max()``; ``model_name`` is used in the figure title. The figure is
    shown immediately and nothing is returned.
    """
    lowest, highest = y_actual.min(), y_actual.max()
    diagonal = [lowest, highest]

    plt.figure(figsize=(8, 6))
    sns.scatterplot(x=y_actual, y=y_pred, color='b', alpha=0.6, edgecolor=None)
    plt.plot(diagonal, diagonal, 'r--', lw=2)

    plt.title(f"{model_name} - Actual vs Fitted Values", fontsize=14)
    plt.xlabel("Actual Values", fontsize=12)
    plt.ylabel("Fitted Values", fontsize=12)
    plt.grid(alpha=0.3)
    plt.show()

In [None]:
def plot_metric_comparison(model_list, mae_list, rmse_list, r2_list):
    """Draw a grouped horizontal bar chart of MAE, RMSE and R2 per model.

    The four lists are parallel: position ``i`` of each metric list belongs
    to ``model_list[i]``. The figure is shown immediately and nothing is
    returned.
    """
    metric_table = pd.DataFrame({
        'Model Name': model_list,
        'MAE': mae_list,
        'RMSE': rmse_list,
        'R2': r2_list
    })

    # Long format: one row per (model, metric) pair, as seaborn's `hue` expects.
    tidy_scores = pd.melt(
        metric_table,
        id_vars="Model Name",
        var_name="Metric",
        value_name="Score",
    )

    plt.figure(figsize=(12, 8))
    sns.barplot(
        data=tidy_scores,
        x="Score",
        y="Model Name",
        hue="Metric",
        palette="coolwarm",
    )
    plt.title("Comparison of Metrics for Different Models", fontsize=14)
    plt.xlabel("Score", fontsize=12)
    plt.ylabel("Model Name", fontsize=12)
    plt.legend(title="Metric", loc='upper right')
    plt.grid(alpha=0.3)
    plt.show()


In [None]:
# Candidate regressors. Every estimator that draws random numbers is seeded
# (same seed as the resampling and the train/test split) so the reported
# scores are identical after Restart & Run All.
models = {
    "Linear Regression": LinearRegression(),
    "Lasso": Lasso(),
    "Ridge": Ridge(),
    "Decision Tree": DecisionTreeRegressor(random_state=42),
    "Random Forest Regressor": RandomForestRegressor(random_state=42),
    "XGBRegressor": XGBRegressor(random_state=42),
    "AdaBoost Regressor": AdaBoostRegressor(random_state=42)
}

# Parallel lists holding the test-set metrics, one entry per model.
model_list = []
r2_list = []
rmse_list = []
mae_list = []

for name, model in models.items():
    model.fit(X_train, y_train)

    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)

    model_train_mae, model_train_rmse, model_train_r2 = evaluate_model(y_train, y_train_pred)
    model_test_mae, model_test_rmse, model_test_r2 = evaluate_model(y_test, y_test_pred)

    print(f"{name}")
    model_list.append(name)

    # Train and test scores are printed together to make overfitting visible.
    print("Model performance for Training set")
    print(f"- Root Mean Squared Error: {model_train_rmse:.4f}")
    print(f"- Mean Absolute Error: {model_train_mae:.4f}")
    print(f"- R2 Score: {model_train_r2:.4f}")

    print('----------------------------------')

    print("Model performance for Test set")
    print(f"- Root Mean Squared Error: {model_test_rmse:.4f}")
    print(f"- Mean Absolute Error: {model_test_mae:.4f}")
    print(f"- R2 Score: {model_test_r2:.4f}")

    # Only the test-set metrics go into the comparison.
    r2_list.append(model_test_r2)
    mae_list.append(model_test_mae)
    rmse_list.append(model_test_rmse)

    print("="*35)
    print('\n')

    plot_actual_vs_fitted(y_test, y_test_pred, name)

# Rank the models by their test-set R2, best first.
results_df = pd.DataFrame(list(zip(model_list, r2_list)), columns=['Model Name', 'R2_Score'])
print(results_df.sort_values(by="R2_Score", ascending=False))

plot_metric_comparison(model_list, mae_list, rmse_list, r2_list)




In [None]:
# Closer look at the decision tree alone. The seed makes the fitted tree, and
# therefore the scores and the plot below, reproducible.
models = {
    "Decision Tree": DecisionTreeRegressor(random_state=42)
}

# Parallel lists holding the test-set metrics, one entry per model.
model_list = []
r2_list = []
rmse_list = []
mae_list = []

for name, model in models.items():
    model.fit(X_train, y_train)

    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)

    model_train_mae, model_train_rmse, model_train_r2 = evaluate_model(y_train, y_train_pred)
    model_test_mae, model_test_rmse, model_test_r2 = evaluate_model(y_test, y_test_pred)

    print(f"{name}")
    model_list.append(name)

    print("Model performance for Training set")
    print(f"- Root Mean Squared Error: {model_train_rmse:.4f}")
    print(f"- Mean Absolute Error: {model_train_mae:.4f}")
    print(f"- R2 Score: {model_train_r2:.4f}")

    print('----------------------------------')

    print("Model performance for Test set")
    print(f"- Root Mean Squared Error: {model_test_rmse:.4f}")
    print(f"- Mean Absolute Error: {model_test_mae:.4f}")
    print(f"- R2 Score: {model_test_r2:.4f}")

    r2_list.append(model_test_r2)
    mae_list.append(model_test_mae)
    rmse_list.append(model_test_rmse)

    print("="*35)
    print('\n')

results_df = pd.DataFrame(list(zip(model_list, r2_list)), columns=['Model Name', 'R2_Score'])
print(results_df.sort_values(by="R2_Score", ascending=False))

# Real vs. predicted values on the test set. `y_test_pred` still holds the
# predictions of the single model fitted in the loop above. The plot now uses
# the whole figure; it used to be drawn in the right-hand cell of a 1x2 grid
# whose left cell was never filled.
plt.figure(figsize=(8, 6))
plt.scatter(y_test, y_test_pred, alpha=0.7, edgecolors='k', color='green')
plt.plot([min(y_test), max(y_test)], [min(y_test), max(y_test)], color='red', linestyle='--')
plt.title("Real vs Predicted")
plt.xlabel("Real Values")
plt.ylabel("Predicted Values")
plt.grid(True)

plt.tight_layout()
plt.show()

In [None]:
# Closer look at the random forest alone. The seed makes the fitted forest,
# and therefore the scores and the chart below, reproducible.
models = {
    "Random Forest Regressor": RandomForestRegressor(random_state=42)
}

# Parallel lists holding the test-set metrics, one entry per model.
model_list = []
r2_list = []
rmse_list = []
mae_list = []

for name, model in models.items():
    model.fit(X_train, y_train)

    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)

    model_train_mae, model_train_rmse, model_train_r2 = evaluate_model(y_train, y_train_pred)
    model_test_mae, model_test_rmse, model_test_r2 = evaluate_model(y_test, y_test_pred)

    print(f"{name}")
    model_list.append(name)

    print("Model performance for Training set")
    print(f"- Root Mean Squared Error: {model_train_rmse:.4f}")
    print(f"- Mean Absolute Error: {model_train_mae:.4f}")
    print(f"- R2 Score: {model_train_r2:.4f}")

    print('----------------------------------')

    print("Model performance for Test set")
    print(f"- Root Mean Squared Error: {model_test_rmse:.4f}")
    print(f"- Mean Absolute Error: {model_test_mae:.4f}")
    print(f"- R2 Score: {model_test_r2:.4f}")

    r2_list.append(model_test_r2)
    mae_list.append(model_test_mae)
    rmse_list.append(model_test_rmse)

    print("="*35)
    print('\n')

results_df = pd.DataFrame(list(zip(model_list, r2_list)), columns=['Model Name', 'R2_Score'])
print(results_df.sort_values(by="R2_Score", ascending=False))

# Side-by-side bars for the first test rows. The count is capped at the size
# of the test set so the bar positions and heights always have equal length.
subset = min(20, len(y_test))

plt.figure(figsize=(12, 6))
x = np.arange(subset)

# `.iloc` makes the positional slice explicit: the index of `y_test` is
# shuffled by the train/test split.
plt.bar(x - 0.2, y_test.iloc[:subset], width=0.4, label="Real Values", color="blue")

plt.bar(x + 0.2, y_test_pred[:subset], width=0.4, label="Predicted Values", color="orange")

plt.title("Real vs Predicted")
plt.xlabel("Index")
plt.ylabel("Values")
plt.legend()
plt.grid(True)
plt.show()