## Classification Model 

We're using the [MAGIC Gamma Telescope data from the UCI Machine Learning Repository](https://archive.ics.uci.edu/dataset/159/magic+gamma+telescope) for this project.

In [11]:
## first the imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

## the sklearn modules
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier

In [8]:
## loading the table straight from the UCI archive url,
## then previewing the first rows to check the layout
magic_gamma = pd.read_csv(
    filepath_or_buffer='https://archive.ics.uci.edu/static/public/159/data.csv',
)
magic_gamma.head()

Unnamed: 0,fLength,fWidth,fSize,fConc,fConc1,fAsym,fM3Long,fM3Trans,fAlpha,fDist,class
0,28.7967,16.0021,2.6449,0.3918,0.1982,27.7004,22.011,-8.2027,40.092,81.8828,g
1,31.6036,11.7235,2.5185,0.5303,0.3773,26.2722,23.8238,-9.9574,6.3609,205.261,g
2,162.052,136.031,4.0612,0.0374,0.0187,116.741,-64.858,-45.216,76.96,256.788,g
3,23.8172,9.5728,2.3385,0.6147,0.3922,27.2107,-6.4633,-7.1513,10.449,116.737,g
4,75.1362,30.9205,3.1611,0.3168,0.1832,-5.5277,28.5525,21.8393,4.648,356.462,g


In [9]:
## column dtypes, non-null counts and memory footprint of the raw frame
magic_gamma.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19020 entries, 0 to 19019
Data columns (total 11 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   fLength   19020 non-null  float64
 1   fWidth    19020 non-null  float64
 2   fSize     19020 non-null  float64
 3   fConc     19020 non-null  float64
 4   fConc1    19020 non-null  float64
 5   fAsym     19020 non-null  float64
 6   fM3Long   19020 non-null  float64
 7   fM3Trans  19020 non-null  float64
 8   fAlpha    19020 non-null  float64
 9   fDist     19020 non-null  float64
 10  class     19020 non-null  object 
dtypes: float64(10), object(1)
memory usage: 1.6+ MB


In [10]:
## changing the types for improving the memory usage
def _narrowest_dtype(series, float_rtol=0.0):
    """Return the name of the narrowest dtype that can hold ``series``.

    Only plain numpy signed-integer and float columns are considered.
    Integers are checked against the target's value range; floats must fit
    the target's range and survive a round-trip cast within ``float_rtol``
    (``0.0`` means bit-exact). Returns ``None`` when no narrower dtype fits.
    """
    dtype = series.dtype
    if not isinstance(dtype, np.dtype) or dtype.kind not in ('i', 'f'):
        return None

    values = series.dropna().to_numpy()
    if values.size == 0:
        ## nothing to validate against, so leave the dtype alone
        return None

    if dtype.kind == 'i':
        low, high = values.min(), values.max()
        for name in ('int8', 'int16', 'int32'):
            limits = np.iinfo(name)
            if (np.dtype(name).itemsize < dtype.itemsize
                    and limits.min <= low and high <= limits.max):
                return name
        return None

    ## range check first, so the trial cast below never overflows to inf
    finite = values[np.isfinite(values)]
    peak = np.abs(finite).max() if finite.size else 0.0
    for name in ('float16', 'float32'):
        if np.dtype(name).itemsize >= dtype.itemsize:
            continue
        if peak > np.finfo(name).max:
            continue
        round_trip = values.astype(name).astype(dtype)
        if np.allclose(round_trip, values, rtol=float_rtol, atol=0.0):
            return name
    return None


def data_cleaner(df, null_threshold=0.5, max_categories=10, float_rtol=0.0):
    """Return a cleaned, memory-optimised copy of ``df``.

    Column names are normalised first (whitespace runs become ``_``, then
    lower-cased). Each column is then handled by the first matching rule:

    * share of nulls >= ``null_threshold``: the column is dropped;
    * text column with fewer than ``max_categories`` distinct values: its
      strings are normalised like the column names and it is one-hot encoded;
    * any other text column: dropped;
    * numpy float / signed-int column: downcast to the narrowest of
      float16/float32 or int8/int16/int32 that can hold every value.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to clean. It is not modified.
    null_threshold : float, default 0.5
        Minimum fraction of nulls at which a column is dropped.
    max_categories : int, default 10
        Text columns with fewer distinct values than this are one-hot encoded.
    float_rtol : float, default 0.0
        Relative error tolerated when downcasting floats. The default only
        downcasts when every value is represented exactly.

    Returns
    -------
    pandas.DataFrame
        A new frame with the rules above applied.
    """
    cleaned = df.copy()
    ## Python's str.replace is literal, so the regex has to go through .str
    cleaned.columns = (
        cleaned.columns.str.replace(r'\s+', '_', regex=True).str.lower()
    )

    drop_cols, encode_cols, downcasts = [], [], {}
    for col in cleaned.columns:
        series = cleaned[col]
        ## covers both object columns and the dedicated pandas string dtype
        is_text = (pd.api.types.is_object_dtype(series)
                   or pd.api.types.is_string_dtype(series))

        if series.isnull().mean() >= null_threshold:
            ## dropping the columns that are mostly null values
            drop_cols.append(col)
        elif is_text and series.nunique() < max_categories:
            cleaned[col] = series.str.replace(r'\s+', '_', regex=True).str.lower()
            encode_cols.append(col)
        elif is_text:
            drop_cols.append(col)
        else:
            target = _narrowest_dtype(series, float_rtol)
            if target is not None:
                downcasts[col] = target

    cleaned = cleaned.drop(columns=drop_cols)
    if encode_cols:
        ## dtype pinned so the dummies are uint8 regardless of pandas version
        cleaned = pd.get_dummies(cleaned, columns=encode_cols, dtype='uint8')
    return cleaned.astype(downcasts)
## cleaning a copy so that `magic_gamma` itself is left as loaded,
## then re-checking the resulting columns and dtypes
cleaned_magic = data_cleaner(magic_gamma.copy())
cleaned_magic.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19020 entries, 0 to 19019
Data columns (total 12 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   flength   19020 non-null  float64
 1   fwidth    19020 non-null  float64
 2   fsize     19020 non-null  float64
 3   fconc     19020 non-null  float64
 4   fconc1    19020 non-null  float64
 5   fasym     19020 non-null  float64
 6   fm3long   19020 non-null  float64
 7   fm3trans  19020 non-null  float64
 8   falpha    19020 non-null  float64
 9   fdist     19020 non-null  float64
 10  class_g   19020 non-null  uint8  
 11  class_h   19020 non-null  uint8  
dtypes: float64(10), uint8(2)
memory usage: 1.5 MB


In [15]:
## getting the proportion of each class
## (the class columns are 0/1 indicators, so their mean is the class share;
##  formatted as a percentage to match the "%" label)
print(f"% of  H : {cleaned_magic['class_h'].mean():.2%}")
print(f"% of  G : {cleaned_magic['class_g'].mean():.2%}")

% of  H : 0.3516298633017876
% of  G : 0.6483701366982124


We have fewer H cases (about 35%) than G cases (about 65%), so the classes are moderately imbalanced.

In [17]:
## splitting the data: 80% train / 20% test, stratified on the class columns
## so both parts keep the same class proportions
X = cleaned_magic.drop(columns=['class_g', 'class_h'])
## NOTE(review): class_g and class_h are complementary indicator columns;
## a single-column target may suit the estimators better - confirm against
## the modelling cells before changing the shape of y
y = cleaned_magic[['class_g', 'class_h']]
## fixed seed so the split (and every score computed from it) is reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, test_size=.2, random_state=42
)


Unnamed: 0,class_g,class_h
16151,0,1
7551,1,0
16840,0,1
468,1,0
8246,1,0
...,...,...
13602,0,1
18336,0,1
16405,0,1
17859,0,1
