In [31]:
import pandas as pd
from sklearn.linear_model import Ridge
from sklearn.model_selection import train_test_split as split
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler

In [43]:
# Load the assignment dataset (CSV expected in the working directory) and
# preview its first five rows.
df = pd.read_csv(filepath_or_buffer='Assignment Datasheet.csv')
df.head(5)


Unnamed: 0,country,year,iso_code,population,gdp,biofuel_cons_change_pct,biofuel_cons_change_twh,biofuel_cons_per_capita,biofuel_consumption,biofuel_elec_per_capita,...,solar_share_elec,solar_share_energy,wind_cons_change_pct,wind_cons_change_twh,wind_consumption,wind_elec_per_capita,wind_electricity,wind_energy_per_capita,wind_share_elec,wind_share_energy
0,ASEAN (Ember),2000,,,,,,,,,...,0.0,,,,,,0.0,,0.0,
1,ASEAN (Ember),2001,,,,,,,,,...,0.0,,,,,,0.0,,0.0,
2,ASEAN (Ember),2002,,,,,,,,,...,0.0,,,,,,0.0,,0.0,
3,ASEAN (Ember),2003,,,,,,,,,...,0.0,,,,,,0.0,,0.0,
4,ASEAN (Ember),2004,,,,,,,,,...,0.0,,,,,,0.0,,0.0,


In [44]:
# Step 1: Declare the model inputs, grouped by how they are preprocessed later.
# Categorical inputs (imputed with the most frequent value, then one-hot encoded).
categorical_features = ["country"]
# Numerical inputs (imputed with the median, then scaled).
numerical_features = [
    "population",
    "gdp",
    "biofuel_consumption",
    "wind_consumption",
]

In [45]:
# Name of the column the regressors are trained to predict.
target_column = "wind_share_elec"

In [46]:
# Keep only the rows whose target value is present; rows without a target
# cannot be used to fit or score the regressors.
df = df.loc[df[target_column].notna()]


In [47]:
# Split the frame into model inputs and target.
# Column order in X: categorical features first, then numerical features.
X = df[[*categorical_features, *numerical_features]]
y = df[target_column]


In [49]:
# Step 2: Preprocessing pipeline
# Numerical branch: fill missing values with the column median, then rescale
# with MinMaxScaler.
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

numerical_transformer = Pipeline(
    [
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", MinMaxScaler()),
    ]
)

In [50]:
# Categorical branch: fill missing values with the most frequent category,
# then one-hot encode. handle_unknown='ignore' is set so that categories not
# seen during fit do not raise at transform time.
categorical_transformer = Pipeline(
    [
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("onehot", OneHotEncoder(handle_unknown="ignore")),
    ]
)


In [52]:
# Route each column group through its own transformer and concatenate the
# results: numerical columns -> 'num' branch, categorical columns -> 'cat' branch.
from sklearn.compose import ColumnTransformer

preprocessor = ColumnTransformer(
    [
        ("num", numerical_transformer, numerical_features),
        ("cat", categorical_transformer, categorical_features),
    ]
)

In [53]:
# Step 3: Train-test split
# Hold out 20% of the rows for evaluation; random_state is fixed so the split
# is reproducible across runs.
# NOTE: the import cell binds sklearn's train_test_split to the alias `split`,
# so the function must be called by that alias. The unaliased name is not
# defined on a fresh kernel and raises NameError under Restart & Run All.
X_train, X_test, y_train, y_test = split(X, y, test_size=0.2, random_state=42)


In [54]:
# Step 4: Modeling
# Ridge regression: shared preprocessing followed by a Ridge regressor
# with alpha=1.0.
ridge_pipeline = Pipeline(
    [
        ("preprocessor", preprocessor),
        ("regressor", Ridge(alpha=1.0)),
    ]
)

In [55]:
# Fit the Ridge pipeline on the training split, then predict the held-out split.
# Pipeline.fit returns the fitted pipeline, so the two calls can be chained.
y_pred_ridge = ridge_pipeline.fit(X_train, y_train).predict(X_test)


In [57]:
# Score the Ridge predictions against the held-out targets.
from sklearn.metrics import mean_squared_error, r2_score  # metric helpers, also used for the KNN model

ridge_mse = mean_squared_error(y_true=y_test, y_pred=y_pred_ridge)
ridge_r2 = r2_score(y_true=y_test, y_pred=y_pred_ridge)

In [58]:
# K-Nearest Neighbors pipeline
# Pipeline does not copy its steps, so passing `preprocessor` directly would
# make this pipeline and the Ridge pipeline hold the *same* transformer object;
# fitting one would then silently refit the other's preprocessing. clone()
# gives this pipeline its own unfitted copy with identical parameters.
from sklearn.base import clone

knn_pipeline = Pipeline(steps=[
    ('preprocessor', clone(preprocessor)),
    ('regressor', KNeighborsRegressor(n_neighbors=5))  # predict from the 5 nearest training rows
])


In [59]:
# Fit the KNN pipeline on the training split, then predict the held-out split.
# Pipeline.fit returns the fitted pipeline, so the two calls can be chained.
y_pred_knn = knn_pipeline.fit(X_train, y_train).predict(X_test)


In [60]:
# Score the KNN predictions against the held-out targets, using the same
# metrics as for the Ridge model.
knn_mse = mean_squared_error(y_true=y_test, y_pred=y_pred_knn)
knn_r2 = r2_score(y_true=y_test, y_pred=y_pred_knn)


In [61]:
# Report both models' metrics, four decimals each: a heading line followed by
# the MSE / R2 line. The leading "\n" on the second heading produces the blank
# line separating the two reports.
print("Ridge Regression Results:", f"MSE: {ridge_mse:.4f}, R2: {ridge_r2:.4f}", sep="\n")
print("\nK-Nearest Neighbors Results:", f"MSE: {knn_mse:.4f}, R2: {knn_r2:.4f}", sep="\n")


Ridge Regression Results:
MSE: 12.1092, R2: 0.3591

K-Nearest Neighbors Results:
MSE: 2.8344, R2: 0.8500
