### Importando as bibliotecas

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
import statsmodels.api as st
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.ensemble import RandomForestRegressor



### Visualizando o arquivo

In [None]:
# Seed NumPy's global RNG (presumably so the randomized steps further down are
# repeatable when the notebook is run top to bottom).
np.random.seed(42)
# NOTE(review): absolute, machine-specific path — this only runs where the file
# exists at exactly this location; consider a project-relative path.
df = pd.read_csv('/Users/reinaldoblack/Downloads/diamante/historico_diamonds.csv')
# Preview the first rows.
df.head()

### Visualizando valores nulos

In [None]:
# Count missing values per column.
df.isnull().sum()


In [None]:
# Count missing values per column (using the `isna` spelling).
df.isna().sum()

### Obtendo informações sobre o arquivo

In [None]:
# Summary statistics of the raw data.
df.describe()

In [None]:
# (rows, columns) of the loaded frame.
df.shape

In [None]:
# Column dtypes and non-null counts.
df.info()

In [None]:
# Number of rows where x is exactly 0.
(df.x == 0).sum()

In [None]:
# Number of rows where y is exactly 0.
(df.y == 0).sum()

In [None]:
# Number of rows where z is exactly 0.
(df.z == 0).sum()

In [None]:
# Show the rows where x is 0.
df[df.x == 0]

In [None]:
# Treat zeros in x, y and z as missing values (presumably a zero here is a
# data-entry gap rather than a real measurement — confirm).
# `np.nan` is used because the `np.NaN` alias was removed in NumPy 2.0.
df[['x', 'y', 'z']] = df[['x', 'y', 'z']].replace(0, np.nan)

In [None]:
# Pairwise correlations of the numeric columns only. At this point the
# categorical columns still hold strings, and pandas >= 2.0 raises on
# non-numeric columns instead of silently dropping them.
df.select_dtypes(include='number').corr()

In [None]:
# Box plots, one subplot per plotted column, arranged on a 3x3 grid.
df.plot(kind='box',figsize=(15,10),subplots=True,layout=(3,3))
plt.show()

In [None]:
def outliers(var, data=None):
    """Return one column's values with IQR outliers replaced by NaN.

    A value counts as an outlier when it lies strictly above
    ``Q3 + 1.5 * IQR`` or strictly below ``Q1 - 1.5 * IQR``. Values that are
    already missing are left as they are.

    Args:
        var: Name of the column to clean.
        data: DataFrame holding the column. Defaults to the notebook-level
            ``df`` so existing ``outliers(col)`` calls keep working.

    Returns:
        A list with one entry per row, in row order. The source frame is not
        modified.
    """
    frame = df if data is None else data
    series = frame[var]

    q1 = series.quantile(.25)
    q3 = series.quantile(.75)
    iqr = q3 - q1
    ulim = float(q3 + 1.5 * iqr)
    llim = float(q1 - 1.5 * iqr)

    # Vectorized replacement for the per-element loop. Comparisons against NaN
    # are False, so existing missing values pass through untouched.
    beyond_fences = (series > ulim) | (series < llim)
    return series.mask(beyond_fences).tolist()

# Replace outliers with NaN in every non-object column, one column at a time.
non_object_columns = df.select_dtypes(exclude='object').columns
for column_name in non_object_columns:
    df[column_name] = outliers(column_name)

In [None]:
# Missing values per column after the outlier step.
df.isna().sum()

In [None]:
# Redraw the box plots to inspect the columns after the outlier step.
df.plot(kind='box',figsize=(15,10),subplots=True,layout=(3,3))
plt.show()

In [None]:
# Box plot of the z column on its own.
sns.boxplot(df['z'])

In [None]:
# Summary statistics after outliers were replaced with NaN.
df.describe()

In [None]:
# Missing values per column, checked again before imputation.
df.isna().sum()

In [None]:
# Fill the gaps in every non-object column with that column's mean.
non_object_columns = df.select_dtypes(exclude='object').columns
for column_name in non_object_columns:
    column_mean = df[column_name].mean()
    df[column_name] = df[column_name].fillna(column_mean)

In [None]:
# Summary statistics after mean imputation.
df.describe()

In [None]:
# Re-count missing values per column after imputation.
df.isna().sum()

In [None]:
# Preview the cleaned frame.
df.head()

In [None]:
# Set the object-typed (categorical) columns aside in their own frame.
df_cat = df.select_dtypes(include='object')
# Frequency of each cut category.
df_cat['cut'].value_counts()

In [None]:
# Frequency of each color category.
df_cat['color'].value_counts()

In [None]:
# Frequency of each clarity category.
df_cat['clarity'].value_counts()

In [None]:
# Integer-encode every categorical column. The single encoder is refit for each
# column, so afterwards `le` only holds the classes of the last column encoded.
# NOTE(review): LabelEncoder numbers classes in sorted order, which presumably
# does not follow the quality ranking of cut/color/clarity — confirm this is
# acceptable for the models below.
le = LabelEncoder()
df_cat = df_cat.apply(le.fit_transform)
# Display the encoded frame.
df_cat

In [None]:
# Remove the original string-valued categorical columns from the main frame.
df = df.drop(columns=df_cat.columns)

In [None]:
# Append the encoded categorical columns back onto the frame, side by side.
df = pd.concat([df,df_cat],axis=1)

In [None]:
# Scatter plot with price on the x-axis and carat on the y-axis.
plt.scatter(df['price'],df['carat'])

In [None]:
# Annotated correlation heatmap over all columns (the categorical columns are
# integer-encoded by this point).
plt.figure(figsize=(10,8))
sns.heatmap(df.corr(),annot=True,cmap='YlGnBu')

In [None]:
# Separate the target (price) from the predictors (every other column).
y = df['price']
X = df.drop(columns='price')

In [None]:
# Fit an ordinary-least-squares regression of price on all predictors;
# `add_constant` supplies the intercept column.
xc = st.add_constant(X)
lm = st.OLS(y,xc).fit()

In [None]:
# Show the fitted OLS model's summary table.
lm.summary()

In [None]:
# Variance inflation factor of each predictor, in column order.
vif = [
    variance_inflation_factor(X.values, position)
    for position, _ in enumerate(X.columns)
]

In [None]:
# Table pairing each VIF value with its column name.
pd.DataFrame({'vif':vif,'cols':X.columns})

In [None]:
# Correlation matrix of the fully numeric frame.
df.corr()

In [None]:
# Rebuild the target and the predictor matrix for the train/test split.
y = df['price']
X = df.drop(columns=['price'])

In [None]:
# Hold out 33% of the rows for testing; the fixed random_state keeps the split
# repeatable.
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=.33,random_state=42)

In [None]:
# Baseline model: linear regression fitted on the training split.
lr = LinearRegression()
lr.fit(X_train,y_train)
y_pred = lr.predict(X_test)
# R^2 on the held-out test split (displayed as the cell output).
r2_score(y_test,y_pred)

In [None]:
# Inspect the predictions produced by the preceding cell.
y_pred

In [None]:
# Inspect the true test-set prices for comparison.
y_test

In [None]:
# Random forest with default hyperparameters.
rr  = RandomForestRegressor()

In [None]:
# Fit the default random forest and predict on the test split.
rr.fit(X_train,y_train)
y_pred = rr.predict(X_test)
# R^2 on the held-out test split (displayed as the cell output).
r2_score(y_test,y_pred)

In [None]:
# Show the forest's current hyperparameters. The method has to be called;
# referencing it without parentheses only displays the bound-method object.
rr.get_params()

In [None]:
# Candidate values for the random-forest hyperparameter search.
# Ten evenly spaced points per range, truncated to whole numbers.
n_estimators = np.linspace(10, 200, 10).astype(int).tolist()
max_depth = np.linspace(10, 100, 10).astype(int).tolist()
min_samples_split = [2, 3, 4, 5, 10]
min_samples_leaf = [1, 2, 4, 10, 15, 20]

# Map each estimator argument name to the values it may take.
random_grid = dict(
    n_estimators=n_estimators,
    max_depth=max_depth,
    min_samples_split=min_samples_split,
    min_samples_leaf=min_samples_leaf,
)

random_grid

In [None]:
from sklearn.model_selection import RandomizedSearchCV
# Randomized hyperparameter search over `random_grid` with 3-fold
# cross-validation on the training split.
rf = RandomForestRegressor()
rf_random = RandomizedSearchCV(estimator=rf,
                               param_distributions=random_grid,
                               cv = 3)

rf_random.fit(X_train,y_train)

In [None]:
# Predict with the fitted search object and report R^2 on the test split.
y_pred = rf_random.predict(X_test)
r2_score(y_test,y_pred)

In [None]:
# Hyperparameter combination selected by the search.
rf_random.best_params_

In [None]:
# Refit a random forest with fixed hyperparameters and score it on the test split.
# NOTE(review): every value below is a member of `random_grid`, so these look
# copied from an earlier `rf_random.best_params_` output — confirm they still
# match, or build the model from `rf_random.best_params_` directly.
rf = RandomForestRegressor(n_estimators=178,
                         min_samples_split=5,
                         min_samples_leaf=1,
                         max_depth=50)
rf.fit(X_train,y_train)
y_pred = rf.predict(X_test)
# R^2 on the held-out test split (displayed as the cell output).
r2_score(y_test,y_pred)

In [None]:
# Save the processed frame to the working directory. `index=False` is not
# passed, so the row index is written as an extra leading column.
df.to_csv('diamante3f.csv')