<a href="https://colab.research.google.com/github/Kengo-Akechi/Carbon-Emission/blob/main/ML_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Loading the Dataset from Google Drive**

In [None]:
# Mount Google Drive inside the Colab runtime; the dataset path used by the
# loading cell lives under this mount point.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import pandas as pd

# Location of the raw CSV on the mounted Drive.
file_path = '/content/drive/MyDrive/fossilfund_dataset.csv'

# Parse the whole file into the raw DataFrame `df` that later cells read from.
df = pd.read_csv(filepath_or_buffer=file_path)


In [None]:
# Preview the first five rows of the raw frame.
df.head(n=5)

Unnamed: 0,Fund profile: Shareclass name,Fund profile: Ticker,Fund profile: Fund name,Fund profile: Asset manager,Fund profile: Shareclass type,Fund profile: Shareclass inception date,Fund profile: Category group,Fund profile: Sustainability mandate,Fund profile: US-SIF member,Fund profile: Oldest shareclass inception date,...,"Prison Free Funds: Border industry, higher risk, weight","Prison Free Funds: Border industry, higher risk, asset","Prison Free Funds: Private prison operators, count","Prison Free Funds: Private prison operators, weight","Prison Free Funds: Private prison operators, asset",Gender Equality Funds: Gender equality score - Overall score (out of 100 points),Gender Equality Funds: Gender equality score - Gender balance in leadership and workforce (out of 40 points),Gender Equality Funds: Gender equality score - Equal compensation and work life balance (out of 30 points),Gender Equality Funds: Gender equality score - Policies promoting gender equality (out of 20 points),"Gender Equality Funds: Gender equality score - Commitment, transparency, and accountability (out of 10 points)"
0,1290 SmartBeta Equity A,TNBRX,1290 SmartBeta Equity Fund,1290 Funds,Open-end mutual fund,2014-11-12,International Equity Funds,Y,,2014-11-12,...,,,,,,,,,,
1,1290 SmartBeta Equity I,TNBRX,1290 SmartBeta Equity Fund,1290 Funds,Open-end mutual fund,2014-11-12,International Equity Funds,Y,,2014-11-12,...,,,,,,,,,,
2,1290 SmartBeta Equity R,TNBRX,1290 SmartBeta Equity Fund,1290 Funds,Open-end mutual fund,2014-11-12,International Equity Funds,Y,,2014-11-12,...,,,,,,,,,,
3,1290 SmartBeta Equity T,TNBRX,1290 SmartBeta Equity Fund,1290 Funds,Open-end mutual fund,2014-11-12,International Equity Funds,Y,,2014-11-12,...,,,,,,,,,,
4,13D Activist A,DDDCX,13D Activist Fund,13D Activist Fund,Open-end mutual fund,2011-12-28,U.S. Equity Fund,Y,,2011-12-28,...,,,,,,,,,,


**Importing libraries**

In [None]:
#importing dependencies
import numpy as np
import re

from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.model_selection import train_test_split

from sklearn.metrics import accuracy_score

import matplotlib.pyplot as plt
import seaborn as sns

import string
import warnings
# Silence every warning category for the rest of the notebook.
# NOTE(review): this also hides pandas' chained-assignment / SettingWithCopy
# warnings, which are relevant to the in-place fillna/drop cells further down;
# consider narrowing the filter to specific categories.
warnings.filterwarnings('ignore')

In [None]:
# (row count, column count) of the raw frame.
df.shape


(110510, 121)

**PREPROCESSING**

In [None]:
# Columns retained for modelling: carbon metrics, fossil-fuel and deforestation
# holdings, gender-equality scores and grades, and fund-profile attributes.
columns_to_keep = [
    'Fossil Free Funds: Relative carbon footprint (tonnes CO2 / $1M USD invested)',
    'Fossil Free Funds: Relative carbon intensity (tonnes CO2 / $1M USD revenue)',
    'Fossil Free Funds: Total financed emissions scope 1 + 2 (tCO2e)',
    'Fossil Free Funds: Total financed emissions scope 1 + 2 + 3 (tCO2e)',
    'Fossil Free Funds: Carbon footprint portfolio coverage by market value weight',
    'Fossil Free Funds: Carbon footprint portfolio coverage by number of disclosing titles',
    'Fossil Free Funds: Fossil fuel holdings, count',
    'Fossil Free Funds: Fossil fuel holdings, weight',
    'Deforestation Free Funds: Deforestation-risk producer, count',
    'Deforestation Free Funds: Deforestation-risk producer, weight',
    'Gender Equality Funds: Gender equality score (out of 100 points)',
    'Gender Equality Funds: Gender equality score, gender balance (out of 100 points)',
    'Gender Equality Funds: Gender equality score, gender policies (out of 100 points)',
    'Fund profile: Shareclass type',
    'Fund profile: Fund net assets',
    'Fund profile: Percent rated',
    'Fund profile: Shareclass inception date',
    'Fossil Free Funds: Fossil fuel grade',
    'Deforestation Free Funds: Deforestation grade',
    'Gender Equality Funds: Gender equality grade',
    'Fund profile: Asset manager',
    'Fund profile: Category group',
    'Fund profile: Sustainability mandate',
    'Fund profile: US-SIF member'
]

# Take an explicit copy of the selected columns. Later cells drop columns and
# fill values on `df_subset`; working on an independent copy keeps those edits
# well-defined and leaves the raw `df` untouched. A KeyError here means one of
# the names above is not a column of `df`.
df_subset = df[columns_to_keep].copy()

# Persist the reduced table next to the notebook (no index column written).
df_subset.to_csv('subset_dataset.csv', index=False)

In [None]:
# Show the dtype of every retained column (numeric vs. object/text).
print(df_subset.dtypes)

Fossil Free Funds: Relative carbon footprint (tonnes CO2 / $1M USD invested)             float64
Fossil Free Funds: Relative carbon intensity (tonnes CO2 / $1M USD revenue)              float64
Fossil Free Funds: Total financed emissions scope 1 + 2 (tCO2e)                          float64
Fossil Free Funds: Total financed emissions scope 1 + 2 + 3 (tCO2e)                      float64
Fossil Free Funds: Carbon footprint portfolio coverage by market value weight            float64
Fossil Free Funds: Carbon footprint portfolio coverage by number of disclosing titles    float64
Fossil Free Funds: Fossil fuel holdings, count                                             int64
Fossil Free Funds: Fossil fuel holdings, weight                                          float64
Deforestation Free Funds: Deforestation-risk producer, count                               int64
Deforestation Free Funds: Deforestation-risk producer, weight                            float64
Gender Equality Funds: Gender 

**Handling Missing Values**


In [None]:
# Count missing values per column in a single vectorised pass, then report
# one line per column.
nan_counts = df_subset.isna().sum()
for col, nan_count in nan_counts.items():
    print(f"Number of NaN values in '{col}': {nan_count}")

Number of NaN values in 'Fossil Free Funds: Relative carbon footprint (tonnes CO2 / $1M USD invested)': 0
Number of NaN values in 'Fossil Free Funds: Relative carbon intensity (tonnes CO2 / $1M USD revenue)': 0
Number of NaN values in 'Fossil Free Funds: Total financed emissions scope 1 + 2 (tCO2e)': 0
Number of NaN values in 'Fossil Free Funds: Total financed emissions scope 1 + 2 + 3 (tCO2e)': 0
Number of NaN values in 'Fossil Free Funds: Carbon footprint portfolio coverage by market value weight': 0
Number of NaN values in 'Fossil Free Funds: Carbon footprint portfolio coverage by number of disclosing titles': 0
Number of NaN values in 'Fossil Free Funds: Fossil fuel holdings, count': 0
Number of NaN values in 'Fossil Free Funds: Fossil fuel holdings, weight': 0
Number of NaN values in 'Deforestation Free Funds: Deforestation-risk producer, count': 0
Number of NaN values in 'Deforestation Free Funds: Deforestation-risk producer, weight': 0
Number of NaN values in 'Gender Equality Fu

In [None]:
# Remove the two fund-profile flag columns from the working frame.
# (presumably because they are mostly missing -- confirm against the NaN counts)
#
# The result is reassigned and errors='ignore' is passed so the cell can be
# re-run safely: the original in-place drop raised
# KeyError "... not found in axis" on a second execution.
columns_to_drop = [
    'Fund profile: US-SIF member',
    'Fund profile: Sustainability mandate',
]
df_subset = df_subset.drop(columns=columns_to_drop, errors='ignore')

KeyError: "['Fund profile: US-SIF member'] not found in axis"

In [None]:
# Label rows that have no gender-equality grade as 'Unknown'.
#
# The fill is applied to `df_subset`, the frame every later cell works on.
# The original filled the column on the raw `df` after `df_subset` had already
# been derived from it, so the subset kept its NaNs. The filled column is
# assigned back instead of using fillna(inplace=True) on a selected column,
# which pandas does not guarantee will write through to the parent frame.
grade_col = 'Gender Equality Funds: Gender equality grade'
df_subset[grade_col] = df_subset[grade_col].fillna('Unknown')

In [None]:
# Impute missing overall gender-equality scores with the column mean.
# The filled column is assigned back to the frame: calling
# fillna(inplace=True) on `df_subset[col]` may operate on a temporary object
# and leave `df_subset` unchanged (it is a no-op under pandas Copy-on-Write).
overall_score_col = 'Gender Equality Funds: Gender equality score (out of 100 points)'
mean_value = df_subset[overall_score_col].mean()  # NaNs are skipped by mean()
df_subset[overall_score_col] = df_subset[overall_score_col].fillna(mean_value)


In [None]:
# Impute missing gender-balance scores with the column mean.
# The filled column is assigned back to the frame: calling
# fillna(inplace=True) on `df_subset[col]` may operate on a temporary object
# and leave `df_subset` unchanged (it is a no-op under pandas Copy-on-Write).
balance_score_col = 'Gender Equality Funds: Gender equality score, gender balance (out of 100 points)'
mean_value = df_subset[balance_score_col].mean()  # NaNs are skipped by mean()
df_subset[balance_score_col] = df_subset[balance_score_col].fillna(mean_value)


In [None]:
# Impute missing gender-policy scores with the column mean.
# The filled column is assigned back to the frame: calling
# fillna(inplace=True) on `df_subset[col]` may operate on a temporary object
# and leave `df_subset` unchanged (it is a no-op under pandas Copy-on-Write).
policy_score_col = 'Gender Equality Funds: Gender equality score, gender policies (out of 100 points)'
mean_value = df_subset[policy_score_col].mean()  # NaNs are skipped by mean()
df_subset[policy_score_col] = df_subset[policy_score_col].fillna(mean_value)

In [None]:
# Rows without a fossil-fuel grade are removed; all other rows are kept and
# the original index labels are preserved.
column_with_nan = 'Fossil Free Funds: Fossil fuel grade'
df_subset = df_subset.dropna(axis=0, how='any', subset=[column_with_nan])

**Making target and features**


In [None]:
# Model inputs, listed explicitly. The original selected them as
# `columns[:-8]` (documented, incorrectly, as "all except the last three"),
# which silently depended on the order and length of the list.
feature_columns = [
    'Fossil Free Funds: Relative carbon intensity (tonnes CO2 / $1M USD revenue)',
    'Fossil Free Funds: Total financed emissions scope 1 + 2 (tCO2e)',
    'Fossil Free Funds: Total financed emissions scope 1 + 2 + 3 (tCO2e)',
    'Fossil Free Funds: Carbon footprint portfolio coverage by market value weight',
    'Fossil Free Funds: Carbon footprint portfolio coverage by number of disclosing titles',
    'Fossil Free Funds: Fossil fuel holdings, count',
    'Fossil Free Funds: Fossil fuel holdings, weight',
    'Deforestation Free Funds: Deforestation-risk producer, count',
    'Deforestation Free Funds: Deforestation-risk producer, weight',
    'Fund profile: Shareclass type',
    'Fund profile: Fund net assets',
    'Fund profile: Percent rated',
]

# Columns that were listed originally but never used as model inputs.
unused_columns = [
    'Fund profile: Shareclass inception date',
    'Deforestation Free Funds: Deforestation grade',
    'Gender Equality Funds: Gender equality grade',
    'Fund profile: Asset manager',
    'Fund profile: Category group',
]

# Regression target.
TARGET_COLUMN = 'Fossil Free Funds: Relative carbon footprint (tonnes CO2 / $1M USD invested)'

# `columns` is rebuilt with exactly the original contents and order (including
# the two emissions columns it repeated at the end), so any other cell that
# reads `columns` sees the same list as before.
columns = feature_columns + unused_columns + [TARGET_COLUMN] + feature_columns[1:3]

# Guard against leaking the target into the inputs.
assert TARGET_COLUMN not in feature_columns, "target must not be used as an input feature"

# Selecting features (inputs)
features = df_subset[feature_columns]

# Selecting target variable (output)
target = df_subset[TARGET_COLUMN]

In [None]:
# Preview the first five rows of the feature matrix.
features.head(n=5)

Unnamed: 0,Fossil Free Funds: Relative carbon intensity (tonnes CO2 / $1M USD revenue),Fossil Free Funds: Total financed emissions scope 1 + 2 (tCO2e),Fossil Free Funds: Total financed emissions scope 1 + 2 + 3 (tCO2e),Fossil Free Funds: Carbon footprint portfolio coverage by market value weight,Fossil Free Funds: Carbon footprint portfolio coverage by number of disclosing titles,"Fossil Free Funds: Fossil fuel holdings, count","Fossil Free Funds: Fossil fuel holdings, weight","Deforestation Free Funds: Deforestation-risk producer, count","Deforestation Free Funds: Deforestation-risk producer, weight",Fund profile: Shareclass type,Fund profile: Fund net assets,Fund profile: Percent rated
0,104.21,1846.27,5965.91,0.981044,0.84456,29,0.058496,5,0.005136,Open-end mutual fund,32236053,0.981064
1,104.21,1846.27,5965.91,0.981044,0.84456,29,0.058496,5,0.005136,Open-end mutual fund,32236053,0.981064
2,104.21,1846.27,5965.91,0.981044,0.84456,29,0.058496,5,0.005136,Open-end mutual fund,32236053,0.981064
3,104.21,1846.27,5965.91,0.981044,0.84456,29,0.058496,5,0.005136,Open-end mutual fund,32236053,0.981064
4,63.08,25580.25,104767.63,0.806359,0.333333,1,0.037922,0,0.0,Open-end mutual fund,346536813,0.80636


In [None]:
# Check data types of columns
# Any object/text column listed here has to be dropped or encoded before
# fitting the estimators in the cells that follow.
print(features.dtypes)

Fossil Free Funds: Relative carbon intensity (tonnes CO2 / $1M USD revenue)              float64
Fossil Free Funds: Total financed emissions scope 1 + 2 (tCO2e)                          float64
Fossil Free Funds: Total financed emissions scope 1 + 2 + 3 (tCO2e)                      float64
Fossil Free Funds: Carbon footprint portfolio coverage by market value weight            float64
Fossil Free Funds: Carbon footprint portfolio coverage by number of disclosing titles    float64
Fossil Free Funds: Fossil fuel holdings, count                                             int64
Fossil Free Funds: Fossil fuel holdings, weight                                          float64
Deforestation Free Funds: Deforestation-risk producer, count                               int64
Deforestation Free Funds: Deforestation-risk producer, weight                            float64
Fund profile: Fund net assets                                                              int64
Fund profile: Percent rated   

In [None]:
# Remove the share-class type column from the inputs. The features preview
# shows it holding text labels such as 'Open-end mutual fund'; presumably it
# is dropped because the estimators below need numeric inputs -- confirm.
#
# errors='ignore' keeps the cell re-runnable: without it, a second execution
# raises KeyError because the column is already gone. The redundant `axis=1`
# is removed, since `columns=` already selects the axis.
del_cols = ['Fund profile: Shareclass type']
features = features.drop(columns=del_cols, errors='ignore')


In [None]:
# Sanity check that both carbon columns are present in the working frame.
# The original evaluated two separate expressions, so only the second result
# was displayed and the first was silently discarded; here a single True
# means both columns exist.
required_columns = (
    'Fossil Free Funds: Relative carbon footprint (tonnes CO2 / $1M USD invested)',
    'Fossil Free Funds: Relative carbon intensity (tonnes CO2 / $1M USD revenue)',
)
all(name in df_subset.columns for name in required_columns)

True

In [None]:
# Preview the first five target values.
target.head(n=5)

0    56.19
1    56.19
2    56.19
3    56.19
4    59.52
Name: Fossil Free Funds: Relative carbon footprint (tonnes CO2 / $1M USD invested), dtype: float64

**SPLITTING DATASET**

In [None]:
# Split data into training and testing sets
# 80% of rows go to training and 20% to testing; random_state fixes the shuffle.
# NOTE(review): the features/target previews above show consecutive rows with
# identical values (apparently several share classes of one fund). A purely
# random row split can place such near-duplicates on both sides and inflate
# test scores -- consider a group-aware split by fund; confirm.
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)

***TRAINING MODELS***

LINEAR REGRESSION MODEL

In [None]:
from sklearn.linear_model import LinearRegression

# Initialize and train Linear Regression model
# Fitted on the training split only; the resulting `model` is what the
# evaluation cells that follow call predict/score on.
model = LinearRegression()
model.fit(X_train, y_train)


In [None]:
from sklearn.metrics import mean_squared_error, r2_score

# Predict the held-out rows with the fitted linear model.
y_pred = model.predict(X_test)

# Mean squared error and coefficient of determination on the test split.
mse, r2 = mean_squared_error(y_test, y_pred), r2_score(y_test, y_pred)

print(f'Mean Squared Error: {mse}')
print(f'R-squared: {r2}')


Mean Squared Error: 4544.854191530387
R-squared: 0.7685474913724896


In [None]:
# Cross-check of the R-squared reported by the previous cell: for regressors,
# `score(X, y)` returns R-squared on the given data.
# The metrics import and the extra `model.predict` call from the original
# cell were unused here (`score` performs its own prediction), so they are
# removed.
r2 = model.score(X_test, y_test)

print(f'R-squared (R2) Score using model.score: {r2}')


R-squared (R2) Score using model.score: 0.7685474913724896


In [None]:
# (row count, feature count) after missing-grade rows and the text column were removed.
features.shape

(110505, 11)

**Using Random Forest**

In [None]:
from sklearn.ensemble import RandomForestRegressor

# A forest of 100 trees; the fixed random_state makes the fit repeatable.
rf_model = RandomForestRegressor(random_state=42, n_estimators=100)

# Fit on the training split only.
rf_model.fit(X=X_train, y=y_train)


In [None]:
# Held-out predictions from the fitted forest (stored in `rf_y_pred`; the
# score call below does not use them).
rf_y_pred = rf_model.predict(X_test)

# R-squared of the forest on the test split.
rf_r2 = rf_model.score(X_test, y_test)
print(f'R-squared (R2) Score for Random Forest Model: {rf_r2}')


R-squared (R2) Score for Random Forest Model: 0.9922454533299528


**Analysis of models**

Linear Regression


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error
import numpy as np

# Uses the `features` / `target` objects built in the earlier cells.

# Same 80/20 split and seed as before, so the numbers are comparable.
X_train, X_test, y_train, y_test = train_test_split(
    features, target, random_state=42, test_size=0.2
)

# Fresh linear model fitted on the training portion.
model = LinearRegression()
model.fit(X_train, y_train)

# 5-fold cross-validation over the full feature/target set, scored with the
# estimator's default scorer.
cv_scores = cross_val_score(model, features, target, cv=5)
print("Cross-Validation Scores:", cv_scores)
print("Mean Cross-Validation Score:", np.mean(cv_scores))

# Held-out predictions and the error metrics derived from them.
predictions = model.predict(X_test)
mse = mean_squared_error(y_test, predictions)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test, predictions)

print("Mean Squared Error (MSE):", mse)
print("Mean Absolute Error (MAE):", mae)
print("Root Mean Squared Error (RMSE):", rmse)


Cross-Validation Scores: [0.75335639 0.7641953  0.76511516 0.78857379 0.77647992]
Mean Cross-Validation Score: 0.7695441120120703
Mean Squared Error (MSE): 4544.854191530387
Mean Absolute Error (MAE): 38.578335426459006
Root Mean Squared Error (RMSE): 67.41553375543641


Random forest

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Uses the `features` / `target` objects built in the earlier cells.

# Same 80/20 split and seed as the other model cells.
X_train, X_test, y_train, y_test = train_test_split(
    features, target, random_state=42, test_size=0.2
)

# 100-tree forest with a fixed seed, fitted on the training portion.
rf_model = RandomForestRegressor(random_state=42, n_estimators=100)
rf_model.fit(X_train, y_train)

# 5-fold cross-validation for the forest is disabled in this cell; to enable
# it, call cross_val_score(rf_model, features, target, cv=5) and print the
# per-fold scores and their mean, as the linear-regression cell does.

# Held-out predictions and the error metrics derived from them.
predictions_rf = rf_model.predict(X_test)
mse_rf = mean_squared_error(y_test, predictions_rf)
rmse_rf = np.sqrt(mse_rf)
mae_rf = mean_absolute_error(y_test, predictions_rf)

print("Mean Squared Error (MSE) - Random Forest:", mse_rf)
print("Mean Absolute Error (MAE) - Random Forest:", mae_rf)
print("Root Mean Squared Error (RMSE) - Random Forest:", rmse_rf)


Mean Squared Error (MSE) - Random Forest: 152.2700451413163
Mean Absolute Error (MAE) - Random Forest: 2.593131021220802
Root Mean Squared Error (RMSE) - Random Forest: 12.339774922636


**Neural Network (NN)**

In [None]:
!pip install tensorflow




In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from tensorflow import keras
from tensorflow.keras import layers
from sklearn.metrics import mean_squared_error, mean_absolute_error

# Seed the Python, NumPy and TensorFlow random generators in one call so that
# weight initialisation and batch shuffling repeat from run to run. Without
# this, the metrics printed below change on every execution.
keras.utils.set_random_seed(42)

# Uses the `features` / `target` objects built in the earlier cells.

# Same 80/20 split and seed as the other model cells.
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)

# Standardise the inputs. The scaler is fitted on the training rows only and
# then applied to the test rows, so no test statistics leak into training.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Fully connected regressor: 64 -> 32 -> 1 units, ReLU on the hidden layers.
# NOTE: rebinding `model` replaces the LinearRegression object of the same
# name created in earlier cells.
model = keras.Sequential([
    layers.Dense(64, activation='relu', input_shape=[X_train_scaled.shape[1]]),
    layers.Dense(32, activation='relu'),
    layers.Dense(1)  # single linear output unit for the regression target
])

# Adam optimiser minimising mean squared error.
model.compile(optimizer='adam', loss='mean_squared_error')

# Train for 100 epochs in batches of 32; the last 20% of the training rows are
# held out as a validation set and reported each epoch.
history = model.fit(X_train_scaled, y_train, epochs=100, batch_size=32, validation_split=0.2, verbose=1)

# Predict the held-out test rows (output has one column per output unit).
predictions = model.predict(X_test_scaled)

# Test-set error metrics.
mse_nn = mean_squared_error(y_test, predictions)
mae_nn = mean_absolute_error(y_test, predictions)
rmse_nn = np.sqrt(mse_nn)

print("Mean Squared Error (MSE) - Neural Network:", mse_nn)
print("Mean Absolute Error (MAE) - Neural Network:", mae_nn)
print("Root Mean Squared Error (RMSE) - Neural Network:", rmse_nn)


Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78