In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Read the raw training and test tables from the parent directory.
train_path, test_path = '../train.csv', '../test.csv'
df_train = pd.read_csv(train_path)
df_test = pd.read_csv(test_path)

In [4]:
# Drop the identifier plus six feature columns; the same list is applied to
# both frames so that train and test keep matching feature sets.
columns_to_drop = [
    'id',
    'gill-spacing',
    'stem-root',
    'stem-surface',
    'veil-type',
    'veil-color',
    'spore-print-color',
]
df_train, df_test = (
    frame.drop(columns=columns_to_drop) for frame in (df_train, df_test)
)

In [3]:
# Summarise the training frame (index range, column names, dtypes).
# NOTE(review): the saved output below still lists 22 columns including `id`,
# i.e. this cell (execution count 3) was run before the column drop in the
# cell above (execution count 4). A fresh Restart & Run All will show the
# reduced frame here instead.
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3116945 entries, 0 to 3116944
Data columns (total 22 columns):
 #   Column                Dtype  
---  ------                -----  
 0   id                    int64  
 1   class                 object 
 2   cap-diameter          float64
 3   cap-shape             object 
 4   cap-surface           object 
 5   cap-color             object 
 6   does-bruise-or-bleed  object 
 7   gill-attachment       object 
 8   gill-spacing          object 
 9   gill-color            object 
 10  stem-height           float64
 11  stem-width            float64
 12  stem-root             object 
 13  stem-surface          object 
 14  stem-color            object 
 15  veil-type             object 
 16  veil-color            object 
 17  has-ring              object 
 18  ring-type             object 
 19  spore-print-color     object 
 20  habitat               object 
 21  season                object 
dtypes: float64(3), int64(1), object(18)
memory

In [5]:
# Count missing values per remaining training column.
df_train.isna().sum()

class                        0
cap-diameter                 4
cap-shape                   40
cap-surface             671023
cap-color                   12
does-bruise-or-bleed         8
gill-attachment         523936
gill-color                  57
stem-height                  0
stem-width                   0
stem-color                  38
has-ring                    24
ring-type               128880
habitat                     45
season                       0
dtype: int64

In [6]:
from sklearn.preprocessing import LabelEncoder

In [7]:
# One encoder instance that is refit for every categorical column below;
# each refit replaces the mapping learned for the previous column.
label = LabelEncoder()

In [8]:
# Replace the string target with integer codes. `label` is refit on feature
# columns further down, so the mapping learned here is not kept on the encoder.
class_codes = label.fit_transform(df_train['class'])
df_train['class'] = class_codes

In [9]:
# Fill missing training cap diameters with the column median
# (Series.median ignores NaN when computing the statistic).
train_diameter = df_train['cap-diameter']
df_train['cap-diameter'] = train_diameter.fillna(train_diameter.median())

In [10]:
# Impute missing test cap diameters with the TRAINING median rather than the
# test set's own median, so both frames are preprocessed with the same
# statistic and nothing is estimated from the test data. Series.median skips
# NaN, so this is correct whether or not the train column is already imputed.
cap_diameter_median = df_train['cap-diameter'].median()
df_test['cap-diameter'] = df_test['cap-diameter'].fillna(cap_diameter_median)

In [11]:
# Integer-encode `cap-shape` with one vocabulary shared by train and test.
# Missing values are filled with an explicit 'Missing' category in BOTH frames
# before fitting. Previously train NaNs were encoded as the encoder's NaN
# class while test NaNs were stringified to 'nan' and fell into 'Unknown',
# so the same missing value got different codes at fit and predict time.
feature = 'cap-shape'
train_values = df_train[feature].fillna('Missing').astype(str)
test_values = df_test[feature].fillna('Missing').astype(str)

df_train[feature] = label.fit_transform(train_values)
known_classes = set(label.classes_)

# Test categories never seen during fitting share a single extra 'Unknown' code.
test_values = test_values.where(test_values.isin(known_classes), 'Unknown')
if 'Unknown' not in known_classes:
    label.classes_ = np.append(label.classes_, 'Unknown')
# Transform unconditionally so the test column is always encoded, even when
# 'Unknown' already happens to be a training category.
df_test[feature] = label.transform(test_values)

In [12]:
# Re-check dtypes after the first encoding steps.
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3116945 entries, 0 to 3116944
Data columns (total 15 columns):
 #   Column                Dtype  
---  ------                -----  
 0   class                 int64  
 1   cap-diameter          float64
 2   cap-shape             int64  
 3   cap-surface           object 
 4   cap-color             object 
 5   does-bruise-or-bleed  object 
 6   gill-attachment       object 
 7   gill-color            object 
 8   stem-height           float64
 9   stem-width            float64
 10  stem-color            object 
 11  has-ring              object 
 12  ring-type             object 
 13  habitat               object 
 14  season                object 
dtypes: float64(3), int64(2), object(10)
memory usage: 356.7+ MB


In [13]:
# Re-count missing values per training column after the steps above.
df_train.isna().sum()

class                        0
cap-diameter                 0
cap-shape                    0
cap-surface             671023
cap-color                   12
does-bruise-or-bleed         8
gill-attachment         523936
gill-color                  57
stem-height                  0
stem-width                   0
stem-color                  38
has-ring                    24
ring-type               128880
habitat                     45
season                       0
dtype: int64

In [14]:
# Integer-encode `cap-surface` with one vocabulary shared by train and test.
# Missing values are filled with an explicit 'Missing' category in BOTH frames
# before fitting. Previously train NaNs were encoded as the encoder's NaN
# class while test NaNs were stringified to 'nan' and fell into 'Unknown',
# so the same missing value got different codes at fit and predict time.
feature = 'cap-surface'
train_values = df_train[feature].fillna('Missing').astype(str)
test_values = df_test[feature].fillna('Missing').astype(str)

df_train[feature] = label.fit_transform(train_values)
known_classes = set(label.classes_)

# Test categories never seen during fitting share a single extra 'Unknown' code.
test_values = test_values.where(test_values.isin(known_classes), 'Unknown')
if 'Unknown' not in known_classes:
    label.classes_ = np.append(label.classes_, 'Unknown')
# Transform unconditionally so the test column is always encoded, even when
# 'Unknown' already happens to be a training category.
df_test[feature] = label.transform(test_values)

In [15]:
# Integer-encode `cap-color` with one vocabulary shared by train and test.
# Missing values are filled with an explicit 'Missing' category in BOTH frames
# before fitting. Previously train NaNs were encoded as the encoder's NaN
# class while test NaNs were stringified to 'nan' and fell into 'Unknown',
# so the same missing value got different codes at fit and predict time.
feature = 'cap-color'
train_values = df_train[feature].fillna('Missing').astype(str)
test_values = df_test[feature].fillna('Missing').astype(str)

df_train[feature] = label.fit_transform(train_values)
known_classes = set(label.classes_)

# Test categories never seen during fitting share a single extra 'Unknown' code.
test_values = test_values.where(test_values.isin(known_classes), 'Unknown')
if 'Unknown' not in known_classes:
    label.classes_ = np.append(label.classes_, 'Unknown')
# Transform unconditionally so the test column is always encoded, even when
# 'Unknown' already happens to be a training category.
df_test[feature] = label.transform(test_values)

In [16]:
# Integer-encode `does-bruise-or-bleed` with one vocabulary shared by train
# and test. Missing values are filled with an explicit 'Missing' category in
# BOTH frames before fitting. Previously train NaNs were encoded as the
# encoder's NaN class while test NaNs were stringified to 'nan' and fell into
# 'Unknown', so the same missing value got different codes at fit and
# predict time.
feature = 'does-bruise-or-bleed'
train_values = df_train[feature].fillna('Missing').astype(str)
test_values = df_test[feature].fillna('Missing').astype(str)

df_train[feature] = label.fit_transform(train_values)
known_classes = set(label.classes_)

# Test categories never seen during fitting share a single extra 'Unknown' code.
test_values = test_values.where(test_values.isin(known_classes), 'Unknown')
if 'Unknown' not in known_classes:
    label.classes_ = np.append(label.classes_, 'Unknown')
# Transform unconditionally so the test column is always encoded, even when
# 'Unknown' already happens to be a training category.
df_test[feature] = label.transform(test_values)

In [17]:
# Integer-encode `gill-attachment` with one vocabulary shared by train and
# test. Missing values are filled with an explicit 'Missing' category in BOTH
# frames before fitting. Previously train NaNs were encoded as the encoder's
# NaN class while test NaNs were stringified to 'nan' and fell into
# 'Unknown', so the same missing value got different codes at fit and
# predict time.
feature = 'gill-attachment'
train_values = df_train[feature].fillna('Missing').astype(str)
test_values = df_test[feature].fillna('Missing').astype(str)

df_train[feature] = label.fit_transform(train_values)
known_classes = set(label.classes_)

# Test categories never seen during fitting share a single extra 'Unknown' code.
test_values = test_values.where(test_values.isin(known_classes), 'Unknown')
if 'Unknown' not in known_classes:
    label.classes_ = np.append(label.classes_, 'Unknown')
# Transform unconditionally so the test column is always encoded, even when
# 'Unknown' already happens to be a training category.
df_test[feature] = label.transform(test_values)

In [18]:
# Integer-encode `gill-color` with one vocabulary shared by train and test.
# Missing values are filled with an explicit 'Missing' category in BOTH frames
# before fitting. Previously train NaNs were encoded as the encoder's NaN
# class while test NaNs were stringified to 'nan' and fell into 'Unknown',
# so the same missing value got different codes at fit and predict time.
feature = 'gill-color'
train_values = df_train[feature].fillna('Missing').astype(str)
test_values = df_test[feature].fillna('Missing').astype(str)

df_train[feature] = label.fit_transform(train_values)
known_classes = set(label.classes_)

# Test categories never seen during fitting share a single extra 'Unknown' code.
test_values = test_values.where(test_values.isin(known_classes), 'Unknown')
if 'Unknown' not in known_classes:
    label.classes_ = np.append(label.classes_, 'Unknown')
# Transform unconditionally so the test column is always encoded, even when
# 'Unknown' already happens to be a training category.
df_test[feature] = label.transform(test_values)

In [19]:
# Re-check dtypes: the columns encoded so far should now be integer-typed.
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3116945 entries, 0 to 3116944
Data columns (total 15 columns):
 #   Column                Dtype  
---  ------                -----  
 0   class                 int64  
 1   cap-diameter          float64
 2   cap-shape             int64  
 3   cap-surface           int64  
 4   cap-color             int64  
 5   does-bruise-or-bleed  int64  
 6   gill-attachment       int64  
 7   gill-color            int64  
 8   stem-height           float64
 9   stem-width            float64
 10  stem-color            object 
 11  has-ring              object 
 12  ring-type             object 
 13  habitat               object 
 14  season                object 
dtypes: float64(3), int64(7), object(5)
memory usage: 356.7+ MB


In [20]:
# Integer-encode `stem-color` with one vocabulary shared by train and test.
# Missing values are filled with an explicit 'Missing' category in BOTH frames
# before fitting. Previously train NaNs were encoded as the encoder's NaN
# class while test NaNs were stringified to 'nan' and fell into 'Unknown',
# so the same missing value got different codes at fit and predict time.
feature = 'stem-color'
train_values = df_train[feature].fillna('Missing').astype(str)
test_values = df_test[feature].fillna('Missing').astype(str)

df_train[feature] = label.fit_transform(train_values)
known_classes = set(label.classes_)

# Test categories never seen during fitting share a single extra 'Unknown' code.
test_values = test_values.where(test_values.isin(known_classes), 'Unknown')
if 'Unknown' not in known_classes:
    label.classes_ = np.append(label.classes_, 'Unknown')
# Transform unconditionally so the test column is always encoded, even when
# 'Unknown' already happens to be a training category.
df_test[feature] = label.transform(test_values)

In [21]:
# Integer-encode `has-ring` with one vocabulary shared by train and test.
# Missing values are filled with an explicit 'Missing' category in BOTH frames
# before fitting. Previously train NaNs were encoded as the encoder's NaN
# class while test NaNs were stringified to 'nan' and fell into 'Unknown',
# so the same missing value got different codes at fit and predict time.
feature = 'has-ring'
train_values = df_train[feature].fillna('Missing').astype(str)
test_values = df_test[feature].fillna('Missing').astype(str)

df_train[feature] = label.fit_transform(train_values)
known_classes = set(label.classes_)

# Test categories never seen during fitting share a single extra 'Unknown' code.
test_values = test_values.where(test_values.isin(known_classes), 'Unknown')
if 'Unknown' not in known_classes:
    label.classes_ = np.append(label.classes_, 'Unknown')
# Transform unconditionally so the test column is always encoded, even when
# 'Unknown' already happens to be a training category.
df_test[feature] = label.transform(test_values)

In [22]:
# Integer-encode `ring-type` with one vocabulary shared by train and test.
# Missing values are filled with an explicit 'Missing' category in BOTH frames
# before fitting. Previously train NaNs were encoded as the encoder's NaN
# class while test NaNs were stringified to 'nan' and fell into 'Unknown',
# so the same missing value got different codes at fit and predict time.
feature = 'ring-type'
train_values = df_train[feature].fillna('Missing').astype(str)
test_values = df_test[feature].fillna('Missing').astype(str)

df_train[feature] = label.fit_transform(train_values)
known_classes = set(label.classes_)

# Test categories never seen during fitting share a single extra 'Unknown' code.
test_values = test_values.where(test_values.isin(known_classes), 'Unknown')
if 'Unknown' not in known_classes:
    label.classes_ = np.append(label.classes_, 'Unknown')
# Transform unconditionally so the test column is always encoded, even when
# 'Unknown' already happens to be a training category.
df_test[feature] = label.transform(test_values)

In [23]:
# Integer-encode `habitat` with one vocabulary shared by train and test.
# Missing values are filled with an explicit 'Missing' category in BOTH frames
# before fitting. Previously train NaNs were encoded as the encoder's NaN
# class while test NaNs were stringified to 'nan' and fell into 'Unknown',
# so the same missing value got different codes at fit and predict time.
feature = 'habitat'
train_values = df_train[feature].fillna('Missing').astype(str)
test_values = df_test[feature].fillna('Missing').astype(str)

df_train[feature] = label.fit_transform(train_values)
known_classes = set(label.classes_)

# Test categories never seen during fitting share a single extra 'Unknown' code.
test_values = test_values.where(test_values.isin(known_classes), 'Unknown')
if 'Unknown' not in known_classes:
    label.classes_ = np.append(label.classes_, 'Unknown')
# Transform unconditionally so the test column is always encoded, even when
# 'Unknown' already happens to be a training category.
df_test[feature] = label.transform(test_values)

In [24]:
# Re-check dtypes: only `season` should still be an object column here.
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3116945 entries, 0 to 3116944
Data columns (total 15 columns):
 #   Column                Dtype  
---  ------                -----  
 0   class                 int64  
 1   cap-diameter          float64
 2   cap-shape             int64  
 3   cap-surface           int64  
 4   cap-color             int64  
 5   does-bruise-or-bleed  int64  
 6   gill-attachment       int64  
 7   gill-color            int64  
 8   stem-height           float64
 9   stem-width            float64
 10  stem-color            int64  
 11  has-ring              int64  
 12  ring-type             int64  
 13  habitat               int64  
 14  season                object 
dtypes: float64(3), int64(11), object(1)
memory usage: 356.7+ MB


In [25]:
# Confirm that no missing values remain in the training frame.
df_train.isna().sum()

class                   0
cap-diameter            0
cap-shape               0
cap-surface             0
cap-color               0
does-bruise-or-bleed    0
gill-attachment         0
gill-color              0
stem-height             0
stem-width              0
stem-color              0
has-ring                0
ring-type               0
habitat                 0
season                  0
dtype: int64

In [26]:
# Encode `season`: fit the mapping on the training column, then apply that
# same mapping to the test column (no unseen-category fallback is used here).
train_season_codes = label.fit_transform(df_train['season'])
df_train['season'] = train_season_codes
test_season_codes = label.transform(df_test['season'])
df_test['season'] = test_season_codes

In [27]:
from sklearn.ensemble import RandomForestClassifier

In [28]:
# Random forests are stochastic (bootstrap samples, random feature subsets),
# so pin the seed to make the fitted model and the submission reproducible.
# n_jobs=-1 builds trees on all available cores; it affects speed only.
model = RandomForestClassifier(n_jobs=-1, random_state=42)

In [29]:
# Split the training frame into target and features; the test frame has no
# target column, so a copy of it is used as the prediction input as-is.
target_column = "class"
y_train = df_train[target_column]
X_train = df_train.drop(columns=[target_column])
X_test = df_test.copy()

In [30]:
# Final dtype check before fitting: every column should be numeric.
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3116945 entries, 0 to 3116944
Data columns (total 15 columns):
 #   Column                Dtype  
---  ------                -----  
 0   class                 int64  
 1   cap-diameter          float64
 2   cap-shape             int64  
 3   cap-surface           int64  
 4   cap-color             int64  
 5   does-bruise-or-bleed  int64  
 6   gill-attachment       int64  
 7   gill-color            int64  
 8   stem-height           float64
 9   stem-width            float64
 10  stem-color            int64  
 11  has-ring              int64  
 12  ring-type             int64  
 13  habitat               int64  
 14  season                int64  
dtypes: float64(3), int64(12)
memory usage: 356.7 MB


In [31]:
# Final missing-value check on the training frame before fitting.
df_train.isna().sum()

class                   0
cap-diameter            0
cap-shape               0
cap-surface             0
cap-color               0
does-bruise-or-bleed    0
gill-attachment         0
gill-color              0
stem-height             0
stem-width              0
stem-color              0
has-ring                0
ring-type               0
habitat                 0
season                  0
dtype: int64

In [32]:
# Fit the classifier on the encoded training features and integer target.
model.fit(X_train,y_train)

In [33]:
# Predict the (integer-encoded) class for every test row.
y_pred = model.predict(X_test)

In [34]:
# Load the sample submission to reuse its layout for the output file.
# NOTE(review): assumes its rows are in the same order as ../test.csv — confirm.
submit = pd.read_csv('../sample_submission.csv')

In [35]:
# `y_pred` holds the integer codes the model was trained on, not the original
# string labels, and `label` has since been refit on feature columns, so it
# can no longer decode them. Rebuild the class vocabulary from the raw
# training file instead: LabelEncoder orders its classes by sorted unique
# value, so code i corresponds to the i-th sorted label.
raw_classes = pd.read_csv('../train.csv', usecols=['class'])['class']
class_names = np.array(sorted(raw_classes.dropna().unique()))
submit['class'] = class_names[np.asarray(y_pred, dtype=int)]

In [36]:
# Write the submission file without the DataFrame index.
submission_path = 'sub13.csv'
submit.to_csv(
    submission_path,
    index=False,
    index_label=False,
)