In [2]:
import math
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from scipy import stats
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler 
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import KFold
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import PolynomialFeatures
from sklearn.model_selection import learning_curve, validation_curve

In [3]:
# Read the train and test CSVs; paths are relative to the working directory.
train_df = pd.read_csv(filepath_or_buffer="./databases/train.csv")
test_df = pd.read_csv(filepath_or_buffer="./databases/test.csv")

In [4]:
# Inspect the training frame as loaded from train.csv.
train_df 

Unnamed: 0,Id,date,Lagging_Current_Reactive.Power_kVarh,Leading_Current_Reactive_Power_kVarh,CO2(tCO2),Lagging_Current_Power_Factor,Leading_Current_Power_Factor,NSM,WeekStatus,Day_of_week,Load_Type,Usage_kWh
0,1,1/1/2018 0:15,2.95,0.0,0.00,73.21,100.0,900,Weekday,Monday,Light_Load,3.17
1,2,1/1/2018 0:30,4.46,0.0,0.00,66.77,100.0,1800,Weekday,Monday,Light_Load,4.00
2,3,1/1/2018 0:45,3.28,0.0,0.00,70.28,100.0,2700,Weekday,Monday,Light_Load,3.24
3,4,1/1/2018 1:00,3.56,0.0,0.00,68.09,100.0,3600,Weekday,Monday,Light_Load,3.31
4,5,1/1/2018 1:15,4.50,0.0,0.00,64.72,100.0,4500,Weekday,Monday,Light_Load,3.82
...,...,...,...,...,...,...,...,...,...,...,...,...
27994,27995,19/10/2018 14:45,32.62,0.0,0.02,85.75,100.0,53100,Weekday,Friday,Maximum_Load,54.36
27995,27996,19/10/2018 15:00,35.46,0.0,0.03,84.76,100.0,54000,Weekday,Friday,Maximum_Load,56.63
27996,27997,19/10/2018 15:15,30.92,0.0,0.03,89.63,100.0,54900,Weekday,Friday,Maximum_Load,62.50
27997,27998,19/10/2018 15:30,58.18,0.0,0.04,83.89,100.0,55800,Weekday,Friday,Maximum_Load,89.68


In [None]:
# Inspect the test frame as loaded from test.csv.
test_df

In [None]:
# Per-column count of missing values in the training frame.
train_df_null = train_df.isna()
print(train_df_null.sum())

In [None]:
# Per-column count of missing values in the test frame.
test_df_null = test_df.isna()
print(test_df_null.sum())

In [None]:
# Mean-impute the two "leading current" columns.
# The mean is learned from the training column only and then applied to BOTH
# frames: the test frame is fed through the same scaler and models later on,
# so a NaN left in it would propagate into (or break) the predictions.
# If the test columns have no missing values the transform is a no-op.
imputer_mean = SimpleImputer(missing_values=np.nan, strategy="mean")
for column in ['Leading_Current_Reactive_Power_kVarh', 'Leading_Current_Power_Factor']:
    # fit_transform refits the imputer for this column; the test transform must
    # therefore happen before the next iteration overwrites the fitted mean.
    # .ravel() turns the (n, 1) result into the 1-D shape a single column expects.
    train_df[column] = imputer_mean.fit_transform(train_df[[column]]).ravel()
    test_df[column] = imputer_mean.transform(test_df[[column]]).ravel()

In [None]:
def weekStatus(day):
    """Map a day name to 'Weekend' or 'Weekday'.

    Only the exact strings 'Saturday' and 'Sunday' count as weekend; every
    other value (including missing values or differently-cased names) is
    reported as 'Weekday'.
    """
    return 'Weekend' if day in ('Saturday', 'Sunday') else 'Weekday'
    
# Rebuild WeekStatus in both frames from the day name stored in Day_of_week.
train_df['WeekStatus'] = train_df['Day_of_week'].map(weekStatus)
test_df['WeekStatus'] = test_df['Day_of_week'].map(weekStatus)

In [None]:
# Parse the day-first timestamps and rebuild the calendar columns from them.
for frame in (train_df, test_df):
    frame['date'] = pd.to_datetime(frame['date'], format='%d/%m/%Y %H:%M')
    frame['Day_of_week'] = frame['date'].dt.day_name()
    # WeekStatus was derived earlier from the raw Day_of_week column, i.e. before
    # that column was rebuilt on the line above. Any row whose raw day name was
    # missing would have been labelled 'Weekday' regardless of the actual date,
    # so derive it again from the parsed timestamp (dayofweek: Monday=0 ... Sunday=6).
    # NOTE(review): assumes 'date' itself has no missing values - a NaT would
    # still fall through to 'Weekday'.
    frame['WeekStatus'] = np.where(frame['date'].dt.dayofweek >= 5, 'Weekend', 'Weekday')

In [None]:
# Re-count missing values per column in the training frame after the cleaning steps above.
train_df_null = train_df.isna()
print(train_df_null.sum())

In [None]:
# Re-count missing values per column in the test frame after the cleaning steps above.
test_df_null = test_df.isna()
print(test_df_null.sum())

In [None]:
# Re-inspect the training frame after the imputation and the date / Day_of_week / WeekStatus rewrites above.
train_df

In [None]:
# Re-inspect the test frame after the date / Day_of_week / WeekStatus rewrites above.
test_df

In [None]:
# Integer-encode the categorical columns on copies, leaving train_df / test_df untouched.
categorical_features = ['WeekStatus', 'Day_of_week', 'Load_Type']
label_encoder = LabelEncoder()
train_df_label_encoder, test_df_label_encoder = train_df.copy(), test_df.copy()

for feature in categorical_features:
    # The single encoder is refitted on each training column in turn, so after
    # the loop it only remembers the mapping of the last feature.
    label_encoder.fit(train_df_label_encoder[feature])
    train_df_label_encoder[feature] = label_encoder.transform(train_df_label_encoder[feature])
    # The test column reuses the mapping learned from the training column.
    test_df_label_encoder[feature] = label_encoder.transform(test_df_label_encoder[feature])

In [None]:
# train_df itself is not modified by the label encoding, which was applied to a copy.
train_df

In [None]:
# test_df itself is not modified by the label encoding, which was applied to a copy.
test_df

In [None]:
# Training copy with WeekStatus, Day_of_week and Load_Type replaced by their label-encoded values.
train_df_label_encoder

In [None]:
# Test copy with WeekStatus, Day_of_week and Load_Type replaced by their label-encoded values.
test_df_label_encoder

In [None]:
def _expand_date_parts(frame):
    """Return `frame` with month/day/hour/minute columns appended and the
    date, Id and Day_of_week columns removed.

    Expects `frame['date']` to be a datetime column (the `.dt` accessor is used).
    """
    stamps = frame['date']
    with_parts = frame.assign(
        month=stamps.dt.month,
        day=stamps.dt.day,
        hour=stamps.dt.hour,
        minute=stamps.dt.minute,
    )
    return with_parts.drop(columns=['date', 'Id', 'Day_of_week'])


# Same transformation for both splits so their feature columns stay aligned.
train_df_label_encoder = _expand_date_parts(train_df_label_encoder)
test_df_label_encoder = _expand_date_parts(test_df_label_encoder)

In [None]:
# Training frame after adding month/day/hour/minute and dropping date, Id and Day_of_week.
train_df_label_encoder

In [None]:
# Test frame after adding month/day/hour/minute and dropping date, Id and Day_of_week.
test_df_label_encoder

In [None]:
# Standardise every model input; the target column (Usage_kWh) is not in the
# list and is therefore left on its original scale.
scaler = StandardScaler()
features_without_target = ['Lagging_Current_Reactive.Power_kVarh', 'Leading_Current_Reactive_Power_kVarh', 
                           'CO2(tCO2)', 'Lagging_Current_Power_Factor','Leading_Current_Power_Factor', 
                           'NSM', 'WeekStatus', 'Load_Type','month', 'day','hour', 'minute']

train_df_scaled = train_df_label_encoder.copy()
test_df_scaled = test_df_label_encoder.copy()

# Assign the scaled arrays directly. Wrapping them in pd.DataFrame first would
# give them a fresh 0..n-1 index, and the assignment would then align rows by
# index label - silently writing NaN wherever the frame's own index is not the
# default RangeIndex. A plain ndarray is written positionally, row for row.
# The scaler is fitted on the training features only and reused for the test set.
train_df_scaled[features_without_target] = scaler.fit_transform(train_df_label_encoder[features_without_target])
test_df_scaled[features_without_target] = scaler.transform(test_df_label_encoder[features_without_target])

In [None]:
# Training frame after standardising the feature columns (the target column is not scaled).
train_df_scaled

In [None]:
# Test frame scaled with the scaler that was fitted on the training features.
test_df_scaled

In [None]:
# Drop every training row in which at least one feature lies more than three
# standard deviations from its column mean.
z_scores_train = np.abs(stats.zscore(train_df_scaled[features_without_target]))
outliers = z_scores_train > 3
print(outliers.sum())

# Row-level flag: True when any feature of that row is an outlier.
outlier_row_mask = np.asarray(outliers).any(axis=1)
# Positions (not index labels) of the flagged rows, kept for inspection.
outlier_indices = np.flatnonzero(outlier_row_mask)

# Filter with the boolean mask, which is applied by position. Dropping by
# `index=outlier_indices` would interpret the positions as index labels, which
# is only correct while the frame still has an untouched 0..n-1 index - e.g.
# re-running this cell would remove the wrong rows or raise a KeyError.
train_df_scaled = train_df_scaled.loc[~outlier_row_mask]

In [None]:
# Evaluate a plain linear regression with shuffled 10-fold cross-validation
# and write one test-set prediction file per fold.

label = 'Usage_kWh'

kf = KFold(n_splits=10, shuffle=True, random_state=42)

# Loop-invariant views of the data, sliced once instead of on every fold.
X_all = train_df_scaled[features_without_target]
y_all = train_df_scaled[label]
X_test = test_df_scaled[features_without_target]

validation_rmse = []
for fold, (train_index, valid_index) in enumerate(kf.split(train_df_scaled), 1):
    print(f"Fold {fold}")

    # kf.split yields row positions, hence .iloc
    X_train, X_valid = X_all.iloc[train_index], X_all.iloc[valid_index]
    y_train, y_valid = y_all.iloc[train_index], y_all.iloc[valid_index]

    # Fit on this fold's training part
    model = LinearRegression()
    model.fit(X_train, y_train)

    # Score on the held-out part of the fold
    y_pred_valid = model.predict(X_valid)
    rmse = np.sqrt(mean_squared_error(y_valid, y_pred_valid))
    print(f"Validation RMSE: {rmse}")
    validation_rmse.append(rmse)

    # Predict the scaled test set with this fold's model; the Id column comes
    # from the original test_df because the scaled copy no longer carries it.
    test_predictions = model.predict(X_test)
    submission_df = pd.DataFrame({'Id': test_df['Id'], 'Usage_kWh_Predicted': test_predictions})

    # Use a model-specific file name: the polynomial-regression cell writes
    # 'predictions_fold_{fold}.csv', so sharing that name would let it silently
    # overwrite the linear-regression predictions produced here.
    submission_df.to_csv(f'predictions_linear_fold_{fold}.csv', index=False)

# Average RMSE over the ten validation folds
mean_validation_rmse = np.mean(validation_rmse)
print(f"Mean Validation RMSE: {mean_validation_rmse}")


In [None]:
# Same 10-fold protocol as for the linear model, but with a degree-3
# polynomial expansion in front of the regression.
label = 'Usage_kWh'

kf = KFold(n_splits=10, shuffle=True, random_state=42)

# Slice features / target / test matrix once, outside the loop
poly_features = train_df_scaled[features_without_target]
poly_target = train_df_scaled[label]
poly_test_features = test_df_scaled[features_without_target]

validation_rmse = []
for fold, (train_index, valid_index) in enumerate(kf.split(train_df_scaled), 1):
    print(f"Fold {fold}")

    # Positional split of this fold
    X_train = poly_features.iloc[train_index]
    X_valid = poly_features.iloc[valid_index]
    y_train = poly_target.iloc[train_index]
    y_valid = poly_target.iloc[valid_index]

    # Polynomial expansion followed by ordinary least squares
    model = make_pipeline(PolynomialFeatures(degree=3), LinearRegression())
    model.fit(X_train, y_train)

    # Held-out error for this fold
    y_pred_valid = model.predict(X_valid)
    rmse = np.sqrt(mean_squared_error(y_valid, y_pred_valid))
    print(f"Validation RMSE: {rmse}")
    validation_rmse.append(rmse)

    # Test-set predictions of this fold's model, keyed by the Id column of the
    # original (unscaled) test frame, written to one CSV per fold
    test_predictions = model.predict(poly_test_features)
    submission_df = pd.DataFrame({'Id': test_df['Id'], 'Usage_kWh_Predicted': test_predictions})
    submission_df.to_csv(f'predictions_fold_{fold}.csv', index=False)

# Average RMSE across the folds
mean_validation_rmse = np.mean(validation_rmse)
print(f"Mean Validation RMSE: {mean_validation_rmse}")


In [None]:
# Learning and validation curves for the polynomial-regression pipeline.
#
# The inputs are built here explicitly instead of reusing `model`, `X_train`
# and `y_train`, which are whatever the last fold of the most recently run
# cross-validation cell left behind (a 90 % subset of the data, and possibly a
# plain LinearRegression without the 'polynomialfeatures' step tuned below).
X_curve = train_df_scaled[features_without_target]
y_curve = train_df_scaled['Usage_kWh']
curve_model = make_pipeline(PolynomialFeatures(degree=3), LinearRegression())
# Same shuffled splitter as the evaluation cells, so the curves are comparable
# with the RMSE values reported there.
curve_cv = KFold(n_splits=10, shuffle=True, random_state=42)

# Learning Curves
train_sizes, train_scores, valid_scores = learning_curve(curve_model, X_curve, y_curve, cv=curve_cv, scoring='neg_root_mean_squared_error')
# The scorer returns negated RMSE; flip the sign back for plotting.
train_scores_mean = -np.mean(train_scores, axis=1)
valid_scores_mean = -np.mean(valid_scores, axis=1)

plt.figure(figsize=(10, 6))
plt.plot(train_sizes, train_scores_mean, label='Training error')
plt.plot(train_sizes, valid_scores_mean, label='Validation error')
plt.xlabel('Training set size')
plt.ylabel('RMSE')
plt.title('Learning Curves')
plt.legend()
plt.grid(True)
plt.show()

# Validation Curves
# The number of polynomial terms grows combinatorially with the degree: with
# the 12 input features it is C(12 + d, d), i.e. 455 columns at d=3, 1,820 at
# d=4 and 293,930 at d=9. Sweeping up to degree 9 is therefore not feasible on
# a dense design matrix, so the sweep is capped; raise MAX_POLY_DEGREE with care.
MAX_POLY_DEGREE = 4
degrees = np.arange(1, MAX_POLY_DEGREE + 1)
train_scores, valid_scores = validation_curve(curve_model, X_curve, y_curve, param_name='polynomialfeatures__degree', param_range=degrees, cv=curve_cv, scoring='neg_root_mean_squared_error')
train_scores_mean = -np.mean(train_scores, axis=1)
valid_scores_mean = -np.mean(valid_scores, axis=1)

plt.figure(figsize=(10, 6))
plt.plot(degrees, train_scores_mean, label='Training error')
plt.plot(degrees, valid_scores_mean, label='Validation error')
plt.xlabel('Degree of polynomial features')
plt.ylabel('RMSE')
plt.title('Validation Curves')
plt.legend()
plt.grid(True)
plt.show()
