In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import statsmodels.api as sma
from statsmodels.stats.outliers_influence import variance_inflation_factor

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
import warnings
warnings.filterwarnings('ignore')

In [2]:
df = pd.read_csv("abalone_dataset.csv")

In [3]:
df.head()

Unnamed: 0,Sex,Length,Diameter,Height,Whole weight,Shucked weight,Viscera weight,Shell weight,Rings
0,M,0.455,0.365,0.095,0.514,0.2245,0.101,0.15,15
1,M,0.35,0.265,0.09,0.2255,0.0995,0.0485,0.07,7
2,F,0.53,0.42,0.135,0.677,0.2565,0.1415,0.21,9
3,M,0.44,0.365,0.125,0.516,0.2155,0.114,0.155,10
4,I,0.33,0.255,0.08,0.205,0.0895,0.0395,0.055,7


In [4]:
print(df.shape)

(4177, 9)


In [5]:
df[df['Height'] == 0]

Unnamed: 0,Sex,Length,Diameter,Height,Whole weight,Shucked weight,Viscera weight,Shell weight,Rings
1257,I,0.43,0.34,0.0,0.428,0.2065,0.086,0.115,8
3996,I,0.315,0.23,0.0,0.134,0.0575,0.0285,0.3505,6


In [6]:
means=pd.pivot_table(df,index=['Sex'],aggfunc={'Height':np.mean})
means

Unnamed: 0_level_0,Height
Sex,Unnamed: 1_level_1
F,0.158011
I,0.107996
M,0.151381


In [7]:
df['Height']=df['Height'].replace(to_replace=0,value=0.0107996)

In [8]:
df=pd.get_dummies(df)

In [9]:
X = df.drop(['Rings'], axis = 1)
y = df['Rings']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

#Use if you want to normalize data
#sc = StandardScaler()
#sc.fit(X_train)
#X_train_std = sc.transform(X_train)
#X_test_std = sc.transform(X_test)
#X_train_std = pd.DataFrame(X_train_std, columns=X_train.columns)
#X_test_std = pd.DataFrame(X_test_std, columns=X_train.columns)

#X_train = X_train_std.values
#X_test = X_test_std.values

X_train = X_train.values
X_test = X_test.values

y_train = y_train.values
y_test = y_test.values

In [10]:
classifiers = {"LogisiticRegression": LogisticRegression(),
               "KNearest": KNeighborsClassifier(),
               "Support Vector Classifier": SVC(),
               "DecisionTreeClassifier": DecisionTreeClassifier(),
               "RandomForestClassifier" : RandomForestClassifier()}

for key, classifier in classifiers.items():
    classifier.fit(X_train, y_train)
    training_score = cross_val_score(classifier, X_train, y_train, cv=5)
    print("Classifiers: ", classifier.__class__.__name__, "Has a training score of", round(training_score.mean(), 2) * 100, "% accuracy score")

Classifiers:  LogisticRegression Has a training score of 25.0 % accuracy score
Classifiers:  KNeighborsClassifier Has a training score of 23.0 % accuracy score
Classifiers:  SVC Has a training score of 24.0 % accuracy score
Classifiers:  DecisionTreeClassifier Has a training score of 20.0 % accuracy score
Classifiers:  RandomForestClassifier Has a training score of 25.0 % accuracy score
