In [None]:
import numpy as np
import pandas as pd

from category_encoders import BinaryEncoder, OneHotEncoder

from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor

from sklearn.metrics import r2_score, mean_squared_error

import matplotlib.pylab as plt
import seaborn as sns

In [None]:
# Read the asteroid CSV that sits next to the notebook and preview the first rows.
df = pd.read_csv(filepath_or_buffer="asteroid.csv")
df.head()

In [None]:
# Column names, dtypes and non-null counts of the freshly loaded frame.
df.info()

In [None]:
# List every column available in the raw data before selecting a subset.
df.columns

In [None]:
# Summary statistics of the raw frame.
df.describe()

In [None]:
# Narrow the frame to the columns the rest of the notebook works with.
# .copy() detaches the result from the originally loaded frame.
columns_to_keep = [
    'spkid', 'full_name', 'orbit_id',
    'e', 'a', 'q', 'i', 'n', 'tp', 'per', 'per_y',
    'class',
    'rms',
]
df = df.loc[:, columns_to_keep].copy()

In [None]:
# Summary statistics after the column selection.
df.describe()

In [None]:
# Number of missing values per column, to decide how to handle them.
df.isna().sum()

In [None]:
# Discard every row that has at least one missing value.
df = df.dropna()

In [None]:
# Re-check dtypes and row counts once rows with missing values are gone.
df.info()

In [None]:
# Preview the cleaned frame.
df.head(5)

In [None]:
# Pairwise correlations of the numeric columns only.
# The frame still contains 'full_name', 'orbit_id' and 'class' at this point
# (presumably text columns); pandas >= 2.0 raises on non-numeric data in
# DataFrame.corr(), so they are excluded explicitly.
df.select_dtypes(include='number').corr()

In [None]:
# Heatmap of the numeric-column correlations.
# Non-numeric columns are excluded first because pandas >= 2.0 raises on them
# in DataFrame.corr(). The trailing semicolon hides the Axes repr in the output.
sns.heatmap(df.select_dtypes(include='number').corr(), annot=True);

In [None]:
# 'per' and 'per_y' are removed because of their very high correlation
df = df.drop(columns=['per', 'per_y'])

In [None]:
# Summary statistics after removing 'per' and 'per_y'.
df.describe()

In [None]:
# Give the 'class' column the shorter name 'c' (avoids the Python keyword when using attribute access).
df = df.rename(columns={'class': 'c'})

In [None]:
# Confirm the rename of 'class' to 'c'.
df.head(5)

In [None]:
# Number of distinct values in column 'c' (missing values, if any, count as one value).
df['c'].nunique(dropna=False)

In [None]:
# 'spkid' and 'full_name' are not used as model features, so remove them.
df = df.drop(columns=['spkid', 'full_name'])

In [None]:
# Preview the frame after removing 'spkid' and 'full_name'.
df.head(5)

In [None]:
# Binary-encode 'orbit_id'; binary encoding is chosen here because the column
# has a lot of distinct values.
orbit_id_encoder = BinaryEncoder()
encoded_orbit_id = orbit_id_encoder.fit_transform(df['orbit_id'])
df_encoded_orbit_id = pd.DataFrame(encoded_orbit_id)
df_encoded_orbit_id.head()

In [None]:
# One-hot encode 'c'; one-hot is chosen here because the column has fewer
# distinct values.
# NOTE(review): value_c is not referenced by any other cell visible here.
value_c = df['c'].values.reshape(-1, 1)
class_encoder = OneHotEncoder()
encoded_c = class_encoder.fit_transform(df['c'])
df_encoded_c = pd.DataFrame(encoded_c)
df_encoded_c.head()

In [None]:
# Row count for each distinct value of 'c'.
df['c'].value_counts()

In [None]:
# Swap the raw 'c' and 'orbit_id' columns for their encoded counterparts.
df = df.drop(columns=['c', 'orbit_id'])

# Column-wise concatenation; rows are matched on the index.
frames_to_join = [df, df_encoded_c, df_encoded_orbit_id]
df_output = pd.concat(frames_to_join, axis='columns')
df_output.head()

In [None]:
# 'rms' is the regression target: pop() returns the column and removes it
# from df_output in one step, leaving only the features behind.
y = df_output.pop('rms')
X = df_output.copy()

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Remove one feature from every pair of highly correlated features.
# The columns to remove are chosen from the training split only and the same
# columns are then removed from the test split.

# A feature is dropped when its absolute correlation with an earlier feature
# exceeds this value.
CORRELATION_THRESHOLD = 0.7

numerical_columns = X_train.select_dtypes(include=['int64', 'float64']).columns

# Absolute pairwise correlations between the numeric training features
corr_matrix = X_train[numerical_columns].corr().abs()

# Keep only the strict upper triangle so every pair is looked at exactly once
upper_mask = np.triu(np.ones(corr_matrix.shape, dtype=bool), k=1)
upper = corr_matrix.where(upper_mask)

# Columns whose correlation with any earlier column is above the threshold
to_drop = [
    column
    for column in upper.columns
    if (upper[column] > CORRELATION_THRESHOLD).any()
]

# Apply the same column removal to both splits
X_train = X_train.drop(columns=to_drop)
X_test = X_test.drop(columns=to_drop)

In [None]:
#function for performance metrics
def performance(y_test, y_pred):
    """Print the R2 score and the mean squared error of a set of predictions.

    Parameters:
        y_test: true target values.
        y_pred: predicted target values, passed to the metrics alongside y_test.

    Returns:
        None; both metrics are written to standard output.
    """
    metrics = (("r2_score", r2_score), ("mse", mean_squared_error))
    for label, metric in metrics:
        print(f"{label} is: {metric(y_test, y_pred)}")

In [None]:
# Fit a decision tree regressor (depth capped at 10, fixed seed) on the
# training split, then report its metrics on the test split.
tree_regressor = DecisionTreeRegressor(max_depth=10, random_state=0)
tree_regressor.fit(X_train, y_train)
y_pred = tree_regressor.predict(X_test)
performance(y_test, y_pred)

In [29]:
# Random forest regressor: try every combination of the hyperparameters below.
# RandomForestRegressor is already imported in the imports cell at the top of
# the notebook, so it is not re-imported here.

#hyperparameters to fit and compare
n_estimators = [50, 100, 200, 400]
max_depths = [5, 10, 20, 30]

# NOTE(review): the metrics printed below are computed on the test split.
# Picking the best combination from them gives an optimistic estimate;
# consider a separate validation split or cross-validation for the selection.
for n_estimator in n_estimators:
    for max_depth in max_depths:
        #training the random forest regressor
        forest_regressor = RandomForestRegressor(
            n_estimators=n_estimator,
            max_depth=max_depth,
            random_state=42,
        )

        #predicting the outcomes
        y_pred = forest_regressor.fit(X_train, y_train).predict(X_test)

        #printing the performance metrics with the shared helper
        print(f"for n_estimators = {n_estimator} and max_depth = {max_depth}")
        performance(y_test, y_pred)
        