In [3]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [4]:
data = pd.read_csv("/content/solarpowergeneration.csv")

FileNotFoundError: [Errno 2] No such file or directory: '/content/solarpowergeneration.csv'

In [None]:
data

In [None]:
data.shape

In [None]:
data.info()

# **Univariate Analysis**

In [None]:
def univariate_analysis(data, column):

    print(f"Univariate Analysis for {column}:")
    print("-" * 30)

    # Descriptive statistics
    print("Descriptive Statistics:")
    print(data[column].describe())

    # Histogram
    plt.figure()
    plt.hist(data[column], bins=20)
    plt.xlabel(column)
    plt.ylabel("Frequency")
    plt.title(f"Histogram of {column}")
    plt.show()

    # Kernel Density Estimation (KDE) plot
    plt.figure()
    sns.kdeplot(data[column])
    plt.xlabel(column)
    plt.ylabel("Density")
    plt.title(f"KDE Plot of {column}")
    plt.show()

    # Skewness
    print(f"Skewness: {data[column].skew()}")

    # Box plot
    plt.figure()
    sns.boxplot(data[column])
    plt.xlabel(column)
    plt.title(f"Box Plot of {column}")
    plt.show()

    print("\n")


for column in data.columns:
    univariate_analysis(data, column)

In [None]:
data.kurtosis()

In [None]:
import matplotlib.pyplot as plt
plt.pie(data['sky-cover'].value_counts(), labels=data['sky-cover'].value_counts().index, autopct='%1.1f%%')
plt.title('Pie Chart of Sky Cover Distribution')
plt.show()


**As we can see thier is skewness in some of the features**

we will perform transformation techniques to alter skewness

#**Treating the missing value**

In [None]:
from sklearn.impute import SimpleImputer

imputer = SimpleImputer(strategy='median')
data["average-wind-speed-(period)"] = imputer.fit_transform(data[["average-wind-speed-(period)"]])

data = pd.DataFrame(data)

In [None]:
data.isnull().sum()

# **Bivariate Analysis of features before log transformation**

In [None]:
def bivariate_analysis(data, column, target_column, hue_column):

    print(f"Bivariate Analysis of {column} with {target_column}, hue={hue_column}")
    print("-" * 50)

    # Scatter plot with hue
    plt.figure()
    sns.scatterplot(data=data, x=column, y=target_column, hue=hue_column)
    plt.xlabel(column)
    plt.ylabel(target_column)
    plt.title(f"Scatter Plot of {column} vs. {target_column} (Hue: {hue_column})")
    plt.show()

    # Correlation coefficient
    correlation = data[column].corr(data[target_column])
    print(f"Correlation Coefficient: {correlation}")

    print("\n")

# Example usage with 'sky-cover' as hue
for column in data.columns:
     if column not in ['power-generated', 'sky-cover']:
         bivariate_analysis(data, column, 'power-generated', 'sky-cover')

In [None]:
#scatter plot bettween sky-cover and power-generated

plt.figure(figsize=(10, 6))
sns.scatterplot(data=data, x="sky-cover", y="power-generated", hue="power-generated")
plt.xlabel("Sky Cover")
plt.ylabel("Power Generated")
plt.title("scatter plot of Sky Cover vs. Power Generated")
plt.show()

**KDE plot of sky cover and power generated before log transformation**

In [None]:
plt.figure(figsize=(10, 6))
for sky_cover in [1, 2, 3, 4]:

    subset = data[data["sky-cover"] == sky_cover]["power-generated"]
    sns.kdeplot(subset, label=f"Sky Cover {sky_cover}")

plt.xlabel("Power Generated")
plt.ylabel("Density")
plt.title("KDE Plot of Power Generated for Different Sky Cover Categories")
plt.legend()
plt.show()

#**Corr matrix and heatmap before log transformation**

In [None]:
data.corr()

In [None]:
plt.figure(figsize=(10, 8))
sns.heatmap(data.corr(), annot = True , cmap = "coolwarm")
plt.show()

# **Boxplot and outliers detection before log transformation to check outliers**

In [None]:
sns.boxplot(data, orient = "h")

In [None]:
Q1 = data.quantile(0.25)
Q3 = data.quantile(0.75)
IQR = Q3 - Q1

outliers = ((data < (Q1 - 1.5 * IQR)) | (data > (Q3 + 1.5 * IQR))).sum()

print(outliers)

# **Log Transformation**

In [None]:
# Apply log transformation
data['log_power_generated'] = np.log(data['power-generated'] + 1)

# Check the distribution of the transformed column
print(data['log_power_generated'].skew())

In [None]:
data = data.drop("power-generated", axis=1)

In [None]:
data.skew()

# **Bivariate Analysis of features after log transformation**

In [None]:
def bivariate_analysis(data, column, target_column, hue_column):

    print(f"Bivariate Analysis of {column} with {target_column}, hue={hue_column}")
    print("-" * 50)

    # Scatter plot with hue
    plt.figure()
    sns.scatterplot(data=data, x=column, y=target_column, hue=hue_column)
    plt.xlabel(column)
    plt.ylabel(target_column)
    plt.title(f"Scatter Plot of {column} vs. {target_column} (Hue: {hue_column})")
    plt.show()

    # Correlation coefficient
    correlation = data[column].corr(data[target_column])
    print(f"Correlation Coefficient: {correlation}")

    print("\n")

# Example usage with 'sky-cover' as hue
for column in data.columns:
    if column not in ['log_power_generated', 'sky-cover']:
        bivariate_analysis(data, column, 'log_power_generated', 'sky-cover')

In [None]:
#scatter plot bettween sky-cover and power-generated

plt.figure(figsize=(10, 6))
sns.scatterplot(data=data, x="sky-cover", y="log_power_generated", hue="log_power_generated")
plt.xlabel("Sky Cover")
plt.ylabel("Power Generated")
plt.title("scatter plot of Sky Cover vs. Power Generated")
plt.show()

In [None]:
plt.figure(figsize=(10, 6))
for sky_cover in [1, 2, 3, 4]:

    subset = data[data["sky-cover"] == sky_cover]["log_power_generated"]
    sns.kdeplot(subset, label=f"Sky Cover {sky_cover}")

plt.xlabel("log Power Generated")
plt.ylabel("Density")
plt.title("KDE Plot of Power Generated for Different Sky Cover Categories")
plt.legend()
plt.show()

#**Corr matrix and heatmap after log transformation**

In [None]:
data.corr()

In [None]:
plt.figure(figsize=(8, 6))
sns.heatmap(data.corr(), annot = True , cmap = "coolwarm")
plt.show()

# **Boxplot and outliers detection after log transformation to check outliers**

In [None]:
sns.boxplot(data, orient="h")

In [None]:
Q1 = data.quantile(0.25)
Q3 = data.quantile(0.75)
IQR = Q3 - Q1

outliers = ((data < (Q1 - 1.5 * IQR)) | (data > (Q3 + 1.5 * IQR))).sum()

print(outliers)

# **Z-score**
lets find the z-score of each feature that have outliers, to know whether we need to remove outliers or not.

**Conclusion**

*   Generally the thresholds for outliers is between -2 and 2.
*   In our dataset outliers present in our features lies between the range of thresholds, so we do not need to transform or remove the outliers.

In [None]:
#wind-direction
x = data['wind-direction'][0]
mean = data['wind-direction'].mean()
std = data['wind-direction'].std()

z = (x - mean) / std
print(z)

#wind-speed
x = data['wind-speed'][0]
mean = data['wind-speed'].mean()
std = data['wind-speed'].std()

z = (x - mean) / std
print(z)

#visibility
x = data['visibility'][0]
mean = data['visibility'].mean()
std = data['visibility'].std()

z = (x - mean) / std
print(z)

#humidity
x = data['humidity'][0]
mean = data['humidity'].mean()
std = data['humidity'].std()

z = (x - mean) / std
print(z)

#average-wind-speed-(period)
x = data['average-wind-speed-(period)'][0]
mean = data['average-wind-speed-(period)'].mean()
std = data['average-wind-speed-(period)'].std()

z = (x - mean) / std
print(z)

#average-pressure-(period)
x = data['average-pressure-(period)'][0]
mean = data['average-pressure-(period)'].mean()
std = data['average-pressure-(period)'].std()

z = (x - mean) / std
print(z)

#log-power-generated
x = data['log_power_generated'][0]
mean = data['log_power_generated'].mean()
std = data['log_power_generated'].std()

z = (x - mean) / std
print(z)

In [None]:
from statsmodels.stats.outliers_influence import variance_inflation_factor

def calculate_vif(data_clean):
    vif_data_clean = pd.DataFrame()
    vif_data_clean["feature"] = data.columns
    vif_data_clean["VIF"] = [variance_inflation_factor(data.values, i) for i in range(len(data.columns))]
    return vif_data_clean

# Assuming 'x' is your DataFrame with independent variables
vif_df = calculate_vif(data)
print(vif_df)

In [None]:
# prompt: write code to drop outliers from each column exclude sky-cover column

Q1 = data.quantile(0.25)
Q3 = data.quantile(0.75)
IQR = Q3 - Q1

# Identify outliers
outliers = ((data < (Q1 - 1.5 * IQR)) | (data > (Q3 + 1.5 * IQR)))

# Drop outliers from all columns except 'sky-cover'
for column in data.columns:
    if column != 'sky-cover':
        data = data[~outliers[column]]

# Reset the index after dropping rows
data = data.reset_index(drop=True)



In [None]:
sns.boxplot(data, orient="h")

In [None]:
Q1 = data.quantile(0.25)
Q3 = data.quantile(0.75)
IQR = Q3 - Q1

outliers = ((data < (Q1 - 1.5 * IQR)) | (data > (Q3 + 1.5 * IQR))).sum()

print(outliers)

In [None]:
from statsmodels.stats.outliers_influence import variance_inflation_factor

def calculate_vif(data_clean):
    vif_data_clean = pd.DataFrame()
    vif_data_clean["feature"] = data.columns
    vif_data_clean["VIF"] = [variance_inflation_factor(data.values, i) for i in range(len(data.columns))]
    return vif_data_clean

# Assuming 'x' is your DataFrame with independent variables
vif_df = calculate_vif(data)
print(vif_df)

In [None]:
data = data.drop("visibility", axis=1)

# **One Hot Encoding to covert categorical column(sky-cover) into numerical column**

In [None]:
from sklearn.preprocessing import OneHotEncoder
ohe = OneHotEncoder()

data_encoded = ohe.fit_transform(data[["sky-cover"]])


data_encoded_df = pd.DataFrame(data_encoded.toarray(), columns=ohe.get_feature_names_out(["sky-cover"]))

data = data.join(data_encoded_df)
data = data.drop("sky-cover", axis=1)

# **Standardization to normalize the whole dataset**

In [None]:
from sklearn.preprocessing import StandardScaler
numerical_cols = ["distance-to-solar-noon", "temperature", "wind-direction",
                  "wind-speed", "humidity",
                  "average-wind-speed-(period)", "average-pressure-(period)",
                  "log_power_generated"]

scaler = StandardScaler()

data[numerical_cols] = scaler.fit_transform(data[numerical_cols])


In [None]:
data

In [None]:
x = data.drop(columns=["log_power_generated"])
y = data["log_power_generated"]

In [None]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

# **Random Forest**

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import plot_tree
import matplotlib.pyplot as plt

# Create a Random Forest Regressor
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)  # Example with 100 trees

# Fit the model to the training data
rf_model.fit(x_train, y_train)

# Make predictions on the test data (optional)
# y_pred = rf_model.predict(x_test)

# Get the first tree from the forest
tree = rf_model.estimators_[0]

# Plot the tree
plt.figure(figsize=(12, 8))
plot_tree(tree, feature_names=x_train.columns, filled=True, rounded=True)
plt.show()

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Create a Random Forest Regressor
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)

# Fit the model to the training data
rf_model.fit(x_train, y_train)

# Make predictions on the test data
y_pred = rf_model.predict(x_test)

# Evaluate the model
rmse = mean_squared_error(y_test, y_pred, squared=False)
print("RMSE:", rmse)

mae = mean_absolute_error(y_test, y_pred)
print("MAE:", mae)

r2 = r2_score(y_test, y_pred)
print("R-squared:", r2)

In [None]:
# write code for hyperparametertuning of the above done model

from sklearn.model_selection import GridSearchCV

# Define the parameter grid
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 5, 10],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}



In [None]:
# Create a Random Forest Regressor
rf_model = RandomForestRegressor(random_state=42)

# Create GridSearchCV object
grid_search = GridSearchCV(estimator=rf_model, param_grid=param_grid, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)




In [None]:
# Fit the grid search to the data
grid_search.fit(x_train, y_train)

# Get the best parameters
best_params = grid_search.best_params_
print("Best parameters:", best_params)



In [None]:
# Get the best score
best_score = grid_search.best_score_
print("Best score:", best_score)

# Train the model with the best parameters
best_rf_model = RandomForestRegressor(**best_params, random_state=42)
best_rf_model.fit(x_train, y_train)

# Make predictions on the test data
y_pred = best_rf_model.predict(x_test)

# Evaluate the model
rmse = mean_squared_error(y_test, y_pred, squared=False)
print("RMSE:", rmse)

mae = mean_absolute_error(y_test, y_pred)
print("MAE:", mae)

r2 = r2_score(y_test, y_pred)
print("R-squared:", r2)

In [None]:
import pickle
import sklearn

In [None]:
with open ("model.pkl", "wb") as file:
  pickle.dump(best_rf_model, file)