In [6]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px
from sklearn.model_selection import train_test_split
import sklearn.metrics

# Read data
pollutiondata = pd.read_excel('Delhi.xlsx')
df = pd.DataFrame(pollutiondata)

# Copying the DataFrame
second = df.copy()

# Converting object values to numbers
dist = second['City']
distset = set(dist)
dd = list(distset)
dictofwords = {dd[i]: i for i in range(0, len(dd))}
second['City'] = second['City'].map(dictofwords)

dist = second['AQI_Bucket']
distset = set(dist)
dd = list(distset)
dictofwords = {dd[i]: i for i in range(0, len(dd))}
second['AQI_Bucket'] = second['AQI_Bucket'].map(dictofwords)
second['AQI_Bucket'] = second['AQI_Bucket'].fillna(second['AQI_Bucket'].mean())

# Filling NaN values and dropping columns
second = second.bfill()
second = second.drop('Date', axis=1)
second = second.drop('AQI_Bucket', axis=1)
second.fillna(0, inplace=True)

# # Visualizing data
# fig = px.scatter(second, x='City', y='AQI')
# fig.show()

# # Scatter plots for other features
# for feature in ['PM10', 'NO2', 'NOx', 'NH3', 'CO', 'SO2', 'O3', 'Benzene', 'Toluene', 'Xylene']:
#     fig = px.scatter(second, x=feature, y='AQI')
#     fig.show()

# Splitting data into features and labels
features = second[['City', 'PM2.5', 'PM10', 'NO', 'NO2', 'NOx', 'NH3', 'CO', 'SO2', 'O3',
                   'Benzene', 'Toluene', 'Xylene', 'AQI']]
labels = second['AQI']

# Splitting data into training and testing sets
xtrain, xtest, ytrain, ytest = train_test_split(features, labels, test_size=0.2, random_state=2)

# Define Decision Tree Regressor
class DecisionTreeRegressor:
    def __init__(self, max_depth=None):
        self.max_depth = max_depth

    def fit(self, X, y):
        self.tree = self._build_tree(X, y, depth=0)

    def _build_tree(self, X, y, depth):
        if depth == self.max_depth or len(set(y)) == 1:
            return np.mean(y)

        num_features = X.shape[1]
        best_feature, best_split_value, best_split_score = None, None, float('inf')

        for feature in range(num_features):
            for split_value in np.unique(X[:, feature]):
                left_indices = X[:, feature] <= split_value
                right_indices = X[:, feature] > split_value

                left_y, right_y = y[left_indices], y[right_indices]

                if len(left_y) == 0 or len(right_y) == 0:
                    continue

                split_score = self._calculate_split_score(left_y, right_y)

                if split_score < best_split_score:
                    best_feature = feature
                    best_split_value = split_value
                    best_split_score = split_score

        if best_split_score == float('inf'):
            return np.mean(y)

        left_indices = X[:, best_feature] <= best_split_value
        right_indices = X[:, best_feature] > best_split_value

        left_subtree = self._build_tree(X[left_indices], y[left_indices], depth + 1)
        right_subtree = self._build_tree(X[right_indices], y[right_indices], depth + 1)

        return {'feature': best_feature, 'split_value': best_split_value,
                'left': left_subtree, 'right': right_subtree}

    def _calculate_split_score(self, left_y, right_y):
        return np.var(left_y) * len(left_y) + np.var(right_y) * len(right_y)

    def predict(self, X):
        predictions = []
        for sample in X:
            predictions.append(self._traverse_tree(sample, self.tree))
        return np.array(predictions)

    def _traverse_tree(self, sample, tree):
        if isinstance(tree, dict):
            if sample[tree['feature']] <= tree['split_value']:
                return self._traverse_tree(sample, tree['left'])
            else:
                return self._traverse_tree(sample, tree['right'])
        else:
            return tree

# Fit and evaluate Decision Tree model
dt_regr = DecisionTreeRegressor(max_depth=2)
dt_regr.fit(xtrain.values, ytrain.values)
y_pred_dt = dt_regr.predict(xtest.values)
r2_score_dt = sklearn.metrics.r2_score(ytest, y_pred_dt)
print("Decision Tree R^2 Score:", r2_score_dt)

# Define Random Forest Regressor
class RandomForestRegressor:
    def __init__(self, n_estimators=100, max_depth=None, random_state=None):
        self.n_estimators = n_estimators
        self.max_depth = max_depth
        self.random_state = random_state
        self.trees = []

    def fit(self, X, y):
        np.random.seed(self.random_state)
        for _ in range(self.n_estimators):
            sample_indices = np.random.choice(len(X), len(X), replace=True)
            X_sampled, y_sampled = X[sample_indices], y[sample_indices]
            tree = DecisionTreeRegressor(max_depth=self.max_depth)
            tree.fit(X_sampled, y_sampled)
            self.trees.append(tree)

    def predict(self, X):
        predictions = np.zeros(len(X))
        for tree in self.trees:
            predictions += tree.predict(X)
        return predictions / self.n_estimators

# Fit and evaluate Random Forest model
rf_regr = RandomForestRegressor(n_estimators=100, max_depth=2, random_state=0)
rf_regr.fit(xtrain.values, ytrain.values)
y_pred_rf = rf_regr.predict(xtest.values)
r2_score_rf = sklearn.metrics.r2_score(ytest, y_pred_rf)
print("Random Forest R^2 Score:", r2_score_rf)

Decision Tree R^2 Score: 0.8854226229423596
Random Forest R^2 Score: 0.9028781459564414


In [7]:
# Calculate Mean Squared Error
mse_dt = sklearn.metrics.mean_squared_error(ytest, y_pred_dt)
mse_rf = sklearn.metrics.mean_squared_error(ytest, y_pred_rf)

# Print R^2 Score and Mean Squared Error
print("Decision Tree R^2 Score:", r2_score_dt)
print("Decision Tree Mean Squared Error:", mse_dt)
print("Random Forest R^2 Score:", r2_score_rf)
print("Random Forest Mean Squared Error:", mse_rf)


Decision Tree R^2 Score: 0.8854226229423596
Decision Tree Mean Squared Error: 1672.6726383655066
Random Forest R^2 Score: 0.9028781459564414
Random Forest Mean Squared Error: 1417.8459310014023


In [8]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px
from sklearn.model_selection import train_test_split
import sklearn.metrics

# Read data
pollutiondata = pd.read_excel('Delhi.xlsx')
df = pd.DataFrame(pollutiondata)

# Copying the DataFrame
second = df.copy()

# Converting object values to numbers
dist = second['City']
distset = set(dist)
dd = list(distset)
dictofwords = {dd[i]: i for i in range(0, len(dd))}
second['City'] = second['City'].map(dictofwords)

dist = second['AQI_Bucket']
distset = set(dist)
dd = list(distset)
dictofwords = {dd[i]: i for i in range(0, len(dd))}
second['AQI_Bucket'] = second['AQI_Bucket'].map(dictofwords)
second['AQI_Bucket'] = second['AQI_Bucket'].fillna(second['AQI_Bucket'].mean())

# Filling NaN values and dropping columns
second = second.bfill()
second = second.drop('Date', axis=1)
second = second.drop('AQI_Bucket', axis=1)
second.fillna(0, inplace=True)

# Splitting data into features and labels
features = second[['City', 'PM2.5', 'PM10', 'NO', 'NO2', 'NOx', 'NH3', 'CO', 'SO2', 'O3',
                   'Benzene', 'Toluene', 'Xylene', 'AQI']]
labels = second['AQI']

# Splitting data into training and testing sets
xtrain, xtest, ytrain, ytest = train_test_split(features, labels, test_size=0.2, random_state=2)

# Define Decision Tree Regressor
class DecisionTreeRegressor:
    def __init__(self, max_depth=None):
        self.max_depth = max_depth

    def fit(self, X, y):
        self.tree = self._build_tree(X, y, depth=0)

    def _build_tree(self, X, y, depth):
        if depth == self.max_depth or len(set(y)) == 1:
            return np.mean(y)

        num_features = X.shape[1]
        best_feature, best_split_value, best_split_score = None, None, float('inf')

        for feature in range(num_features):
            for split_value in np.unique(X[:, feature]):
                left_indices = X[:, feature] <= split_value
                right_indices = X[:, feature] > split_value

                left_y, right_y = y[left_indices], y[right_indices]

                if len(left_y) == 0 or len(right_y) == 0:
                    continue

                split_score = self._calculate_split_score(left_y, right_y)

                if split_score < best_split_score:
                    best_feature = feature
                    best_split_value = split_value
                    best_split_score = split_score

        if best_split_score == float('inf'):
            return np.mean(y)

        left_indices = X[:, best_feature] <= best_split_value
        right_indices = X[:, best_feature] > best_split_value

        left_subtree = self._build_tree(X[left_indices], y[left_indices], depth + 1)
        right_subtree = self._build_tree(X[right_indices], y[right_indices], depth + 1)

        return {'feature': best_feature, 'split_value': best_split_value,
                'left': left_subtree, 'right': right_subtree}

    def _calculate_split_score(self, left_y, right_y):
        return np.var(left_y) * len(left_y) + np.var(right_y) * len(right_y)

    def predict(self, X):
        predictions = []
        for sample in X:
            predictions.append(self._traverse_tree(sample, self.tree))
        return np.array(predictions)

    def _traverse_tree(self, sample, tree):
        if isinstance(tree, dict):
            if sample[tree['feature']] <= tree['split_value']:
                return self._traverse_tree(sample, tree['left'])
            else:
                return self._traverse_tree(sample, tree['right'])
        else:
            return tree

# Fit and evaluate Decision Tree model
dt_regr = DecisionTreeRegressor(max_depth=2)
dt_regr.fit(xtrain.values, ytrain.values)
y_pred_dt = dt_regr.predict(xtest.values)
r2_score_dt = sklearn.metrics.r2_score(ytest, y_pred_dt)
print("Decision Tree R^2 Score:", r2_score_dt)

# Define Random Forest Regressor
class RandomForestRegressor:
    def __init__(self, n_estimators=50, max_depth=None, random_state=None):
        self.n_estimators = n_estimators
        self.max_depth = max_depth
        self.random_state = random_state
        self.trees = []

    def fit(self, X, y):
        np.random.seed(self.random_state)
        for _ in range(self.n_estimators):
            sample_indices = np.random.choice(len(X), len(X), replace=True)
            X_sampled, y_sampled = X[sample_indices], y[sample_indices]
            tree = DecisionTreeRegressor(max_depth=self.max_depth)
            tree.fit(X_sampled, y_sampled)
            self.trees.append(tree)

    def predict(self, X):
        predictions = np.zeros(len(X))
        for tree in self.trees:
            predictions += tree.predict(X)
        return predictions / self.n_estimators

# Fit and evaluate Random Forest model
rf_regr = RandomForestRegressor(n_estimators=50, max_depth=2, random_state=0)
rf_regr.fit(xtrain.values, ytrain.values)
y_pred_rf = rf_regr.predict(xtest.values)
r2_score_rf = sklearn.metrics.r2_score(ytest, y_pred_rf)
print("Random Forest R^2 Score:", r2_score_rf)


Decision Tree R^2 Score: 0.8854226229423596
Random Forest R^2 Score: 0.9008785652841652
