# Load in our libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

**Load and check data**

In [None]:
# Read the full competition training CSV into the `train` DataFrame.
train = pd.read_csv("../input/tabular-playground-series-dec-2021/train.csv")



Below are the first 5 rows of the train dataset:

In [None]:
# Bare expression: the notebook renders the first rows of `train` as the cell output.
train.head()

The dimensions and number of missing values in the train dataset are as below:


In [None]:
# One-line report: row count, column count, and total number of missing cells.
print(
    f'Number of rows: {len(train)};  '
    f'Number of columns: {len(train.columns)}; '
    f'No of missing values: {train.isna().sum().sum()}'
)

# Info

In [None]:
# Print pandas' concise summary of the `train` frame.
train.info()

# Summary statistics

In [None]:
# Descriptive statistics, transposed (.T) so each column of `train` becomes one row of the output.
train.describe().T

The distribution of the target variable `Cover_Type` in the train dataset is as below:

In [None]:
# Bar chart of how many training rows fall into each Cover_Type class,
# with every bar labelled by its share (in percent) of all rows.
fig, ax = plt.subplots(1, 1, figsize=(17, 8))

target_count = train['Cover_Type'].value_counts().sort_index()

# Alternate two shades of blue, generating one colour per class actually
# present instead of a hard-coded number of colours.
ax.bar(target_count.index, target_count,
       color=['#1520E6' if i % 2 == 0 else '#93D1FF' for i in range(len(target_count))],
       width=0.55,
       edgecolor='black',
       linewidth=0.7)

ax.margins(0.02, 0.05)

# Walk the classes found in the data rather than assuming labels 1..7, so a
# class that is absent from the frame no longer raises a KeyError.
for cover_type, count in target_count.items():
    # Place the percentage text slightly above the top of its bar.
    ax.annotate(f'{count/len(train)*100:.3}', xy=(cover_type, count+1000),
                va='center', ha='center')

ax.set_title('Cover_Type Distribution', weight='bold', fontsize=15)
ax.grid(axis='y', linestyle='-', alpha=0.4)

fig.tight_layout()
plt.show()

In [None]:
# Tabulate the class counts next to their percentage share, largest share first.
target_count = train['Cover_Type'].value_counts().sort_index()
target_count_df = target_count.to_frame()
# Optional display tweak (left disabled): pd.options.display.float_format = '{:,.2f}%'.format
target_count_df['Cover_Type(%)'] = target_count / target_count.sum() * 100
target_count_df = target_count_df.sort_values('Cover_Type(%)', ascending=False)
display(target_count_df)

In [None]:
# Re-read the training CSV, this time limited to the first 400000 rows (nrows); this rebinds `train`.
train = pd.read_csv("../input/tabular-playground-series-dec-2021/train.csv",nrows=400000)


In [None]:
# Remove the "Id" column from `train` in place.
train.drop(columns=["Id"], inplace=True)

In [None]:
# Separate the predictors (X) from the target column (y).
X = train.drop(columns=['Cover_Type'])
y = train['Cover_Type']

### 1. Feature Selection - Dropping constant features
In this step we will remove the constant features (those with zero variance), which carry no information
for solving the problem statement.

In [None]:
### Fit a selector that flags zero-variance (constant) features
from sklearn.feature_selection import VarianceThreshold
# threshold=0: per scikit-learn's docs, only features with non-zero variance are kept.
var_thres=VarianceThreshold(threshold=0)
# Bare expression: fit() learns the per-column variances of X and the fitted selector is shown as cell output.
var_thres.fit(X)

In [None]:
# Show the selector's support mask for the columns of X (one entry per column).
var_thres.get_support()

In [None]:
### Count the non-constant features (the True entries of the support mask)
var_thres.get_support().sum()

In [None]:
# Same count, obtained from the column index filtered by the support mask
X.columns[var_thres.get_support()].size

In [None]:
# Names of the columns the selector rejected (the constant features), in their
# original column order. The support mask is fetched once and negated, instead
# of being rebuilt and searched again for every column of X.
constant_columns = X.columns[~var_thres.get_support()].tolist()

print(len(constant_columns))

In [None]:
# List every constant column name on its own line (prints nothing when there are none).
if constant_columns:
    print(*constant_columns, sep="\n")

train.drop(constant_columns,axis=1)

### 2. Feature Selection- With Correlation
In this step we will be removing the features which are highly correlated 

In [None]:
# NOTE(review): `load_boston` is not referenced anywhere in the visible notebook.
# It was deprecated in scikit-learn 1.0 and removed in 1.2, so this import fails
# on current versions — confirm it is unused and drop it.
from sklearn.datasets import load_boston

In [None]:
# Show the pairwise correlation matrix of the columns of X as the cell output.
X.corr()

In [None]:
import seaborn as sns

# Pearson correlation between every pair of features, drawn as an annotated heatmap.
cor = X.corr()
plt.figure(figsize=(12, 10))
sns.heatmap(cor, cmap=plt.cm.CMRmap_r, annot=True)
plt.show()

In [None]:
# With the following function we can select highly correlated features.
# For each pair of columns whose absolute correlation exceeds the threshold,
# it flags the later column of the pair (the earlier one is kept).

def correlation(dataset, threshold):
    """Return the names of columns that correlate strongly with an earlier column.

    The correlation matrix of ``dataset`` is computed with ``dataset.corr()``.
    A column is flagged when the absolute value of its correlation with any
    column positioned before it is strictly greater than ``threshold``.

    Args:
        dataset: DataFrame whose columns are compared pairwise.
        threshold: Cut-off applied to the absolute correlation coefficient.

    Returns:
        A set with the names of the flagged columns.
    """
    corr_matrix = dataset.corr()
    flagged = set()
    for position, name in enumerate(corr_matrix.columns):
        # Only the entries left of the diagonal, i.e. correlations with earlier columns.
        against_earlier = corr_matrix.iloc[position, :position].abs()
        if (against_earlier > threshold).any():
            flagged.add(name)
    return flagged

In [None]:
# Collect the columns flagged at an absolute correlation above 0.7 and show how many there are.
corr_features = correlation(X, 0.7)
len(corr_features)

In [None]:
# Bare expression: show the set of flagged column names as the cell output.
corr_features

train.drop(corr_features,axis=1)

## 3.Feature Selection-Information gain - mutual information In Classification Problem Statements

Mutual Information
MI Estimate mutual information for a discrete target variable.

Mutual information (MI) between two random variables is a non-negative value, which measures the dependency between the variables. It is equal to zero if and only if two random variables are independent, and higher values mean higher dependency.

The function relies on nonparametric methods based on entropy estimation from k-nearest neighbors distances.

In short:

A quantity called mutual information measures the amount of information one can obtain from one random variable given another.

The mutual information between two random variables X and Y can be stated formally as follows:

$$I(X; Y) = H(X) - H(X \mid Y)$$

where $I(X; Y)$ is the mutual information for $X$ and $Y$, $H(X)$ is the entropy for $X$, and $H(X \mid Y)$ is the conditional entropy for $X$ given $Y$. The result has the units of bits (when logarithms are taken base 2).

In [None]:
# Start again from the first 400000 rows of the training file.
train = pd.read_csv("../input/tabular-playground-series-dec-2021/train.csv", nrows=400000)

# Drop the "Id" column in place, then separate the predictors (X) from the target (y).
train.drop(columns=["Id"], inplace=True)
X = train.drop(columns=['Cover_Type'])
y = train['Cover_Type']


In [None]:
from sklearn.feature_selection import mutual_info_classif
# Estimate the mutual information between each column of X and the target y.
# NOTE(review): no random_state is passed; per scikit-learn's docs the estimator
# adds small random noise to continuous features, so scores may differ slightly
# between runs — consider fixing a seed.
mutual_info = mutual_info_classif(X, y)
# Bare expression: show the raw scores as the cell output.
mutual_info

In [None]:
# Label each score with its feature name, then show the scores from highest to lowest.
mutual_info = pd.Series(mutual_info, index=X.columns)
mutual_info.sort_values(ascending=False)

In [None]:
# Bar chart of the mutual information scores per feature, highest first
mutual_info.sort_values(ascending=False).plot(kind='bar', figsize=(20, 8))

In [None]:
from sklearn.feature_selection import SelectKBest

In [None]:
# Now we will select the top 5 features, scored by mutual information with the target
sel_five_cols = SelectKBest(mutual_info_classif, k=5)
# Bare expression: fit the selector on (X, y); the fitted selector is shown as cell output.
sel_five_cols.fit(X, y)


In [None]:
# Show the selector's support mask for the columns of X (one entry per column).
sel_five_cols.get_support()

In [None]:
# Count the features kept by the SelectKBest selector (k=5 was requested)
len(X.columns[sel_five_cols.get_support()])

In [None]:
# Names of the features kept by SelectKBest, in their original column order.
# The support mask is fetched once, instead of being rebuilt and searched
# again for every column of X.
Five_columns = X.columns[sel_five_cols.get_support()].tolist()

# Print one selected feature name per line.
for column in Five_columns:
    print(column)