In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
import urllib.request
import missingno as msno
from sklearn.feature_selection import mutual_info_regression
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
# %%
url = 'https://raw.githubusercontent.com/goitacademy/MACHINE-LEARNING-NEO/main/datasets/mod_05_topic_10_various_data.pkl'
# NOTE(review): unpickling runs whatever code the payload carries, so this is
# only acceptable while the URL above points at a source you trust.
with urllib.request.urlopen(url) as response:
    payload = response.read()
datasets = pickle.loads(payload)

# %%
# Pick the 'autos' entry out of the downloaded collection.
autos = datasets['autos']

In [None]:
# Keep a copy of the loaded object under a separate name; the next cell
# inspects `auto` and re-binds `autos` from it.
auto = autos.copy()

In [None]:
# Report what kind of object was loaded and unwrap it when it is a dict.
print(type(auto))
if isinstance(auto, dict):
    for key, value in auto.items():
        print(f'{key}: {type(value)}')
    # NOTE(review): this branch reads the key 'Autos' while the loading cell
    # used datasets['autos'] -- confirm which spelling the payload really uses.
    autos = auto['Autos']
else:
    autos = auto
# %%
print("\nDataset 'autos' preview:")
print(autos.head(10))
print()
# %%
print()
# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() used to append a stray "None" line to the output.
autos.info()

In [None]:
# Draw missingno's matrix plot for `autos` to eyeball where values are missing.
msno.matrix(autos)

In [None]:
# Number of missing values per column.
print()
print(autos.isnull().sum())

# %% 
# Automatic identification of categorical features (object / category dtypes)
categorical_features = autos.select_dtypes(include=['object', 'category']).columns.tolist()

# %% 
# Additionally - discrete numerical features: numeric columns with fewer than
# 20 distinct values, excluding the target 'price'.
for col in autos.select_dtypes(include=['int64', 'float64']).columns:
    if col == 'price' or col in categorical_features:
        # The membership check keeps the list free of duplicates when this
        # cell is re-run without re-running the one above.
        continue
    if autos[col].nunique() < 20:
        categorical_features.append(col)
# Raw string: '\d' is not a valid escape sequence and triggers a SyntaxWarning
# on recent Python versions; the printed text is unchanged.
print(r'Categorical\discret features:', categorical_features)

# %% 
# Label Encoding of categorical/discrete features
# Work on a copy so `autos` keeps the original values.
autos_encoded = autos.copy()

# le_dict maps column name -> the LabelEncoder fitted on that column.
le_dict = {}
for col in categorical_features:
    values = autos_encoded[col]
    # LabelEncoder assigns codes in sorted order of the classes. Casting a
    # numeric column to str made that order lexicographic ('10' < '2',
    # '-1' < '-2'), scrambling the numeric ordering of discrete features.
    # Clean numeric columns are therefore encoded as numbers; text columns and
    # columns with missing values keep the original string encoding, where a
    # NaN becomes its own 'nan' label.
    if not pd.api.types.is_numeric_dtype(values) or values.isna().any():
        values = values.astype(str)
    le = LabelEncoder()
    autos_encoded[col] = le.fit_transform(values)
    le_dict[col] = le
print()
print(autos_encoded.head(10))
# %% 
# Target: the 'price' column, printed in ascending order for a quick look
y = autos_encoded.loc[:, 'price']
print()
print(y.sort_values(ascending=True))
# %% 
# Feature matrix: every input column, i.e. everything except 'price'
X = autos_encoded.drop('price', axis='columns')

# %% 
# Mutual information between each feature and the target
# One boolean per column of X: True where the column was flagged as
# categorical/discrete earlier in the notebook.
discrete_mask = [col in categorical_features for col in X.columns]
# random_state pins the estimator's internal randomness so the scores are the
# same on every run, matching the seed used for the split and the forest.
mi_scores = mutual_info_regression(X, y, discrete_features=discrete_mask, random_state=0)
mi_series = pd.Series(mi_scores, index=X.columns).sort_values(ascending=False)
print()
print('Mutual Inforamtion:\n', mi_series)

# %% 
# Split into train / test parts (seeded)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

# %% 
# Random forest: fit on the training part, then read feature_importances_
model = RandomForestRegressor(n_estimators=100, random_state=0).fit(X_train, y_train)

# Label each importance with its column name, largest first.
importances = pd.Series(model.feature_importances_, index=X.columns)
importances = importances.sort_values(ascending=False)
print()
print('The importance of sings from RandomForest:\n', importances)

# %% 
# Convert both scores to percentile ranks so they share one scale
mi_rank = mi_series.rank(pct=True)
importance_rank = importances.rank(pct=True)

# %% 
# Combine both rankings into one frame indexed by feature name.
# The two Series are indexed by feature name but sorted differently. Passing
# X.columns as a plain 'Feature' column next to them attached the names by
# position while the ranks were aligned by label, so set_index('Feature') put
# ranks under the wrong features. Build the frame from the Series alone (pandas
# aligns them on their labels), then restore the column order of X and name
# the index.
ranks_autos = (
    pd.DataFrame({'MI Rank': mi_rank,
                  'Model Importance Rank': importance_rank})
    .reindex(X.columns)
    .rename_axis('Feature')
)
ranks_autos.head(10)

# %% 
# Melt to long format for seaborn: one row per (feature, metric) pair
ranks_melted = ranks_autos.reset_index().melt(id_vars='Feature',
                                              var_name='Metric',
                                              value_name='Rank')
ranks_melted.head(10)

# %% 
# Draw the comparison chart
# Features are listed by their mean rank across both metrics, highest first.
feature_order = ranks_autos.mean(axis=1).sort_values(ascending=False).index
sns.set(style='whitegrid')

g = sns.catplot(data=ranks_melted,
                kind='bar',
                x='Rank',
                y='Feature',
                hue='Metric',
                order=feature_order,
                height=8,
                aspect=1.2,
                palette='muted',
                legend_out=False)
# FacetGrid.set_titles() only writes titles for row/col facets; this grid has
# none, so the call left the plot untitled. Set the title on the single Axes.
g.ax.set_title('Comparison of MI rank values and feature importance')
g.set_xlabels("Rank (Normalized)")
g.set_ylabels("Feature")
plt.tight_layout()
plt.show()