In [None]:
import numpy as np 
import pandas as pd 
import os
# Print the full path of every file found under the Kaggle input directory.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for fname in files:
        print(os.path.join(root, fname))
from sklearn.preprocessing import LabelEncoder
from imblearn.over_sampling import SVMSMOTE
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, precision_score, recall_score, f1_score


In [None]:
from matplotlib import pyplot as plt
import seaborn as sns
%matplotlib inline
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.figure_factory as ff
from plotly.offline import init_notebook_mode
# Switch plotly into offline notebook mode so figures can be rendered inside the notebook.
# NOTE(review): connected=True presumably loads plotly.js from a CDN and so needs internet access — confirm.
init_notebook_mode(connected=True)

In [None]:
# Load the near-Earth-objects dataset from the Kaggle input directory.
df = pd.read_csv(
    filepath_or_buffer="/kaggle/input/nasa-nearest-earth-objects/neo_v2.csv"
)


In [None]:
# Preview the first rows of the freshly loaded frame.
df.head()

In [None]:
# Column dtypes and non-null counts, used to spot the non-numerical columns.
df.info()

<h3>Using LabelEncoder to convert non-numerical features to numerical features</h3>

In [None]:
# Replace each non-numerical column with integer codes.
# The same encoder instance is re-fitted per column, so after this cell `le`
# only remembers the classes of the last column in the list.
le = LabelEncoder()
non_n_feature = ['name', 'orbiting_body', 'sentry_object']
df[non_n_feature] = df[non_n_feature].apply(le.fit_transform)
    
    

In [None]:
# Preview the frame again to check the label-encoded columns.
df.head()

In [None]:
# Re-check dtypes after encoding.
df.info()

In [None]:
# Features are every column except the last; the last column is the target.
X, y = df.iloc[:, :-1], df.iloc[:, -1]

In [None]:
# (rows, feature columns) of the feature matrix.
X.shape

In [None]:
# Number of target labels; should match the row count of X.
y.shape

In [None]:
# Bar chart of how many rows fall in each target class (shows the class imbalance).
y.value_counts().plot.bar()

<h2>Generate synthetic samples using SVM-SMOTE to fix the class imbalance</h2>

In [None]:
# Oversample the minority class with SVM-SMOTE.
# random_state is pinned so the synthetic samples, and every downstream result,
# are reproducible across kernel restarts (42 matches the seed used for the
# train/test split later in the notebook).
# NOTE(review): resampling happens before the train/test split, so synthetic
# points derived from rows that later land in the test set can leak into
# training. Consider splitting first and resampling only the training part.
smote = SVMSMOTE(random_state=42)
X_re, y_re = smote.fit_resample(X, y)

In [None]:
# Bar chart of class counts after resampling, to confirm the classes are now balanced.
y_re.value_counts().plot.bar()

<h3>Normalisation (standardising the features with StandardScaler)</h3>

In [None]:
# Standardise every feature of the resampled data.
# X_re is overwritten with the scaled values, so from here on it is the
# scaler's output rather than the resampler's output.
scaler = StandardScaler()
X_re = scaler.fit_transform(X_re)

<h3>Feature Selection</h3>

In [None]:
#using rfe - recursive feature elimination
from sklearn.feature_selection import RFE
from sklearn.ensemble import RandomForestClassifier

# Recursive feature elimination driven by a random-forest estimator.
# The forest is seeded so the set of selected features is the same on every
# run; a later cell edits the selection by feature name and would otherwise
# act on a different list each time.
model = RandomForestClassifier(random_state=42)

# Keep the 6 features the estimator ranks highest.
rfe = RFE(model, n_features_to_select=6)

rfe.fit(X_re, y_re)

In [None]:
# Boolean mask over the input feature columns: True where RFE kept the feature.
selected_features = rfe.support_

In [None]:
# Translate the RFE mask into column names. The mask is positionally aligned
# with df's columns (the target is the last column and has no mask entry, so
# zip simply stops before it).
selected_feature_names = [
    column for column, keep in zip(df.columns, selected_features) if keep
]
print("Selected features:")
print(selected_feature_names)

In [None]:
# Pairwise correlation matrix of all columns, drawn as an annotated heatmap.
corr_matrix = df.corr()
heatmap_ax = sns.heatmap(data=corr_matrix, annot=True)
heatmap_ax.figure.suptitle('Bivariate Correlations')
plt.show()

In [None]:
# The correlation heatmap shows that est_diameter_min and est_diameter_max are
# perfectly correlated, so only one of them is kept.
# relative_velocity takes its place because it correlates more strongly with
# the target variable than est_diameter_min does.
#
# Both edits are guarded so the cell is safe when RFE did not select
# est_diameter_min (list.remove would raise ValueError) and when it is re-run.
# The append is skipped if relative_velocity is already present, since a
# duplicate name would yield duplicate columns in the feature frame.
if 'est_diameter_min' in selected_feature_names:
    selected_feature_names.remove('est_diameter_min')
if 'relative_velocity' not in selected_feature_names:
    selected_feature_names.append('relative_velocity')

In [None]:
# Display the feature list after the manual adjustment.
selected_feature_names

In [None]:
# Wrap the scaled feature array back into a DataFrame, reusing the original
# feature column names (every df column except the last one, the target).
df_final = pd.DataFrame(data=X_re, columns=df.columns[:-1])

In [None]:
# Preview the scaled feature frame.
df_final.head()

In [None]:
# Keep only the selected feature columns.
# .copy() makes the result an independent frame, so adding a column to it
# afterwards does not raise pandas' SettingWithCopyWarning.
df_final = df_final[selected_feature_names].copy()

In [None]:
# Attach the resampled target as the last column of the modelling frame.
# NOTE(review): if y_re is a pandas Series, this assignment aligns on index — assumes
# df_final and y_re share the same default 0..n-1 index; confirm.
df_final['hazardous']=y_re

In [None]:
# Preview the final frame: selected features plus the 'hazardous' target.
df_final.head()

In [None]:
# (rows, columns) of the final frame; columns = selected features + target.
df_final.shape

# Exploratory Data Analysis (EDA)

In [None]:
# One histogram per selected feature, stacked vertically in a single figure.
n_panels = len(selected_feature_names)
fig = make_subplots(rows=n_panels, cols=1)

for row_idx, feature in enumerate(selected_feature_names, start=1):
    histogram = go.Histogram(x=df_final[feature], name=feature)
    fig.add_trace(histogram, row=row_idx, col=1)

# Give each panel 400px of height so the subplots stay readable.
fig.update_layout(height=400 * n_panels, showlegend=True)

fig.show()

Bivariate Analysis

In [None]:
# Pairwise scatter/density grid of the selected features, coloured by the target class.
pairplot_columns = selected_feature_names + ['hazardous']
sns.pairplot(data=df_final[pairplot_columns], hue='hazardous')

MODEL BUILDING AND TRAINING

In [None]:
# Hold out 25% of the rows for testing. The last column of df_final is the
# target; everything before it is a feature. The seed keeps the split fixed.
features = df_final.iloc[:, :-1]
target = df_final.iloc[:, -1]
X_train, X_test, y_train, y_test = train_test_split(
    features, target, test_size=0.25, random_state=42
)

In [None]:
# Shapes of the four split parts, to sanity-check the 75/25 split.
X_train.shape, X_test.shape,y_train.shape, y_test.shape

In [None]:
# Summary statistics of the training features.
X_train.describe()

In [None]:
# SVM kernels to compare, and the per-kernel metric lists filled in by the
# training loop (one entry per kernel, in the same order as `kernels`).
kernels = ['linear', 'poly', 'rbf', 'sigmoid']
kernel_score = {'Precision': [], 'F1 Score': []}


In [None]:
# Train one SVM per kernel on the training split, score it on the test split,
# and record the scores for the comparison chart.
#
# The accumulators are reset first so that re-running this cell does not
# append a second set of scores; otherwise the lists grow longer than
# `kernels` and the comparison table built from them fails with a length
# mismatch.
kernel_score['Precision'] = []
kernel_score['F1 Score'] = []

for kernel in kernels:
    print(f'SVM MODEL USING {kernel.upper()} KERNEL')
    svm = SVC(kernel=kernel)
    svm.fit(X_train, y_train)
    y_pred = svm.predict(X_test)

    # average='weighted': per-class scores averaged with class support as weights.
    precision = precision_score(y_test, y_pred, average='weighted')
    f1 = f1_score(y_test, y_pred, average='weighted')

    kernel_score['Precision'].append(precision)
    kernel_score['F1 Score'].append(f1)
    print(f"Kernel: {kernel}")
    print(f"Precision: {precision:.4f}")
    print(f"F1-Score: {f1:.4f}")

    # Confusion matrix for this kernel (rows: actual class, columns: predicted class).
    confusion_mtrx = confusion_matrix(y_test, y_pred)
    sns.heatmap(confusion_mtrx, annot=True, fmt='g', cbar=False)
    plt.xlabel('Predicted Values')
    plt.ylabel('Actual Values')
    plt.title('Confusion Matrix')
    plt.show()

    

<h2>COMPARING RESULTS ACROSS DIFFERENT SVM KERNELS</h2>

In [None]:
# One row per kernel, with 'Precision' and 'F1 Score' columns taken from the score lists.
df_kernel=pd.DataFrame(kernel_score)

In [None]:
# Label each row with its kernel name; the scores were appended in `kernels` order.
df_kernel = df_kernel.assign(Type=kernels)

In [None]:
# Grouped bar chart comparing precision and F1 score for each kernel.
# px.bar with barmode='group' puts the two metrics side by side. The previous
# px.histogram call stacked them by default, so each bar's height was
# Precision + F1 and the "Score" axis was misleading.
fig = px.bar(
    df_kernel,
    x='Type',
    y=['Precision', 'F1 Score'],
    barmode='group',
    template='plotly_dark',
    color_discrete_sequence=['gold', 'snow'],
    title='Precision and F1 Score Comparison',
)
fig.update_layout(
    xaxis_title="Kernel Type", yaxis_title="Score"
)
fig.show()