# Discover and Visualize the Data to Gain Insights

In [None]:
import pandas as pd
import numpy as np

In [None]:
# Load the dataset from a CSV file located relative to the current working
# directory, then preview its first rows.
data = pd.read_csv('./data2.csv')
data.head()

In [None]:
# Print a summary of the frame: column dtypes, non-null counts and memory usage.
data.info()

There are no null values.

In [None]:
# drop_duplicates() returns a new DataFrame instead of modifying `data` in
# place, so the result has to be assigned back for the removal to take effect.
data = data.drop_duplicates()
# Descriptive statistics of the de-duplicated data.
data.describe()

In [None]:
# Number of rows per distinct value of the 'fertility' column (class balance).
data['fertility'].value_counts()

- 0 describes low-fertility
- 1 describes medium-fertility
- 2 describes high-fertility

In [None]:
# Work on a copy so that later steps leave the loaded `data` frame untouched.
work_data = data.copy()
work_data.head()

In [None]:
# Feature matrix: every column except the 'fertility' target.
X = work_data.drop(columns='fertility')
# Target, kept as a one-column DataFrame (double brackets) rather than a Series.
y = work_data[['fertility']]

In [None]:
# Preview the first rows of the feature matrix.
X.head()

In [None]:
# Preview the first rows of the target column.
y.head()

In [None]:
# Pairwise correlation matrix of the feature columns
# (DataFrame.corr uses Pearson correlation by default).
corr = X.corr()

In [None]:
# pyplot must be imported explicitly: `plt` is used below and in later cells,
# and importing seaborn does not put it into the notebook namespace.
import matplotlib.pyplot as plt
import seaborn as sns

# Square heatmap of the feature correlation matrix on a diverging palette,
# with the colour scale pinned to the full correlation range [-1, 1].
f, ax = plt.subplots(figsize=(6, 6))
sns.heatmap(corr,
    cmap=sns.diverging_palette(220, 10, as_cmap=True),
    vmin=-1.0, vmax=1.0,
    square=True, ax=ax)

In [None]:
# One 50-bin histogram per feature column, to inspect the raw distributions.
X.hist(bins=50, figsize=(15,15))
plt.show()

# Prepare the Data for Machine Learning Algorithms

### Feature Scaling

In [None]:
# Apply a base-10 log to every numeric column; non-numeric columns pass
# through unchanged.
# NOTE(review): log10 yields -inf for zeros and NaN for negative values -
# confirm all numeric features are strictly positive.
transformed_X = X.apply(
    lambda col: col if not np.issubdtype(col.dtype, np.number) else np.log10(col)
)

In [None]:
# Same 50-bin histograms after the log10 transform, for comparison with the
# raw feature distributions.
transformed_X.hist(bins=50, figsize=(15,15))
plt.show()

# Select and Train a Model

### Splitting data into train and val

In [None]:
from sklearn.model_selection import train_test_split

# Shuffled hold-out split with a fixed seed. No test_size/train_size is given,
# so scikit-learn's default of 25% of the rows goes to the validation set.
# NOTE(review): this splits the raw `X`, not `transformed_X` - confirm that
# training on the untransformed features is intended.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    random_state=42,
    shuffle=True,
)
X_train.shape

In [None]:
from sklearn import tree, ensemble
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier

In [None]:
# Flatten the one-column target DataFrame into a 1-D array, the shape
# scikit-learn classifiers expect for `y`.
y_train_ravel = y_train.values.ravel()

In [None]:
# Support-vector classifier with scikit-learn's default hyperparameters.
svcClf = SVC()
svcClf.fit(X_train, y_train_ravel)
# Predict with the fitted `svcClf` instance; Python names are case-sensitive,
# so any other capitalisation would raise NameError.
svc_pred = svcClf.predict(X_val)

In [None]:
# Random forest with default hyperparameters. The seed pins the bootstrap and
# feature sampling so the validation score is reproducible between runs,
# in line with the seeded train/validation split.
forestClf = ensemble.RandomForestClassifier(random_state=42)
forestClf.fit(X_train, y_train_ravel)
forest_pred = forestClf.predict(X_val)

In [None]:
# Gaussian naive Bayes with default settings. fit() returns the estimator
# itself, so construction and training are chained.
nbClf = GaussianNB().fit(X_train, y_train_ravel)
nb_pred = nbClf.predict(X_val)

In [None]:
# k-nearest-neighbours classifier with default settings. fit() returns the
# estimator itself, so construction and training are chained.
knnClf = KNeighborsClassifier().fit(X_train, y_train_ravel)
knn_pred = knnClf.predict(X_val)

In [None]:
# Single decision tree with default hyperparameters. scikit-learn permutes the
# candidate features at every split, so ties between equally good splits are
# broken randomly; the seed makes the fitted tree reproducible between runs.
treeClf = tree.DecisionTreeClassifier(random_state=42)
treeClf.fit(X_train, y_train_ravel)
tree_pred = treeClf.predict(X_val)

In [None]:
from sklearn.metrics import accuracy_score

# Score every fitted baseline on the validation set and collect the results
# for a side-by-side bar chart.
models = [svcClf, forestClf, nbClf, knnClf, treeClf]
accs = []
titles = []

for model in models:
    # The estimator's class name doubles as its label in the printout and plot.
    model_name = type(model).__name__
    pred = model.predict(X_val)
    model_acc = accuracy_score(y_val, pred)
    accs.append(model_acc)
    titles.append(model_name)
    print(f"{model_name} accuracy is {model_acc}")

# Bar chart of validation accuracy per model.
fig = plt.figure(figsize=(10, 5))
sns.barplot(x=titles, y=accs)

### Pros & Cons
**SVC**
1. works well when there is a clear margin of separation between classes
2. more effective in high dimensional spaces
3. effective in cases where the number of dimensions is greater than the number of samples
4. memory efficient
5. not suitable for large data sets

**Random Forest Classifier**
1. quite fast
2. able to deal with unbalanced and missing data
3. may over-fit data sets that are particularly noisy
4. data-hungry

**GaussianNB**
1. very fast 
2. better than other models with less training data if the assumption of independence of features holds
3. Naive Bayes classifiers in general perform exceptionally well with categorical input variables compared to numerical ones; `GaussianNB` specifically models numerical features and assumes each is normally distributed within a class
4. works effectively for multi-class prediction

**KNeighborsClassifier**
1. robust to noisy data
2. effective if the training data is large

**DecisionTreeClassifier**
1. does not require normalization of data
2. does not require scaling of data either
3. can take longer to train

In [None]:
# Base estimator for the hyperparameter search; the fixed seed keeps results
# comparable across parameter combinations and across runs.
forest = ensemble.RandomForestClassifier(random_state=42)

In [None]:
# Hyperparameter grid for the random-forest search.
# 'auto' is intentionally absent from max_features: for classifiers it was an
# alias of 'sqrt' (so it only duplicated a third of the search), it was
# deprecated in scikit-learn 1.1 and removed in 1.3, where passing it makes
# the corresponding fits fail.
param_grid = {
    'n_estimators': [200, 300, 500],
    'max_features': ['sqrt', 'log2'],
    'max_depth': [4, 5, 6, 7, 8, 9, 10],
    'criterion': ['gini', 'entropy'],
}

In [None]:
from sklearn.model_selection import GridSearchCV

# Exhaustive search over `param_grid`, scoring each combination with 5-fold
# cross-validation on the training split.
clf = GridSearchCV(forest, param_grid, cv=5)
clf.fit(X_train, y_train_ravel)

In [None]:
# Hyperparameter combination with the best mean cross-validation score.
clf.best_params_