In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from sklearn import metrics
from sklearn import preprocessing
from sklearn import tree
from sklearn.decomposition import PCA
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFE
from sklearn.feature_selection import SelectFromModel
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.metrics import cohen_kappa_score
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier



In [None]:
# IPython magic: render matplotlib figures inline in the cell output.
%matplotlib inline

# Reducción de datos con Sample

In [None]:
# Down-sample train.csv without loading it whole: read it in chunks of `size`
# rows and keep a random sample of up to 79 rows from every chunk.
size = 10000
df_chunk = pd.read_csv('train.csv', chunksize=size)

# Collect the per-chunk samples in a list and concatenate once.
# - DataFrame.append was removed in pandas 2.0 and re-copied the whole
#   accumulated frame on every iteration.
# - min(...) guards the final chunk, which may hold fewer than 79 rows and
#   would otherwise make sample() raise ValueError.
samples = [chunk.sample(n=min(79, len(chunk))) for chunk in df_chunk]

# pd.concat rejects an empty list, so fall back to an empty frame when the
# reader yielded no chunks.
df_empty = pd.concat(samples) if samples else pd.DataFrame()


In [None]:
# (rows, columns) of the down-sampled frame.
df_empty.shape

In [None]:
# Disabled export of the sample to 'chunkP4.csv'.
# mode='a' appends, so running it more than once would add duplicate rows to
# the file; no index=False is passed, so the row index would also be written
# as an extra column.
#header = True
#df_empty.to_csv('chunkP4.csv', header=header, mode='a')
#header = False

In [None]:
# Load the saved sample into `dataset`.
# NOTE(review): within this notebook 'chunkP4.csv' is only written by the
# commented-out to_csv call, so the file must already exist on disk for a
# fresh Restart & Run All to succeed.
dataset = pd.read_csv('chunkP4.csv')

In [None]:
# Column names, dtypes and non-null counts of the loaded sample.
dataset.info()

In [None]:
# (rows, columns) of the loaded sample.
dataset.shape

# Datos vacíos

In [None]:
# Number of missing values in each column.
nulos = dataset.isnull().sum()

# Total number of cells (rows * columns). np.prod replaces np.product, which
# was deprecated and then removed in NumPy 2.0.
total_cells = np.prod(dataset.shape)
print(total_cells)

total_missing = nulos.sum()
print(total_missing)

# Percentage of cells that are missing; as the last expression of the cell it
# is what the notebook displays.
(float(total_missing) / total_cells) * 100

In [None]:
# Optional distribution checks for the non-null distances (kept disabled):
#sns.boxplot(dataset.loc[dataset['orig_destination_distance'].notnull()]['orig_destination_distance'])
#sns.countplot(dataset.loc[dataset['orig_destination_distance'].notnull()]['orig_destination_distance'])

# Impute missing distances with the column median (the median is used because
# the column has outliers).
# The result is assigned back to the column: calling fillna(inplace=True) on
# dataset[...] is chained assignment, which pandas warns about and which no
# longer updates `dataset` once copy-on-write is enabled.
dataset['orig_destination_distance'] = dataset['orig_destination_distance'].fillna(
    dataset['orig_destination_distance'].median()
)

In [None]:
# Recount the missing values per column after the imputation step.
nulos = dataset.isnull().sum()

# Total number of cells (rows * columns). np.prod replaces np.product, which
# was deprecated and then removed in NumPy 2.0.
total_cells = np.prod(dataset.shape)
print(total_cells)

total_missing = nulos.sum()
print(total_missing)

# Percentage of cells that are still missing; displayed as the cell output.
(float(total_missing) / total_cells) * 100

In [None]:
# Mean distance of each row's hotel_cluster, aligned with dataset's index.
# The previous code passed the hotel_cluster *values* to .loc, which looks rows
# up by index label instead of grouping by cluster, and combined isnull() with
# the integer cluster ids through a bitwise AND.
# NOTE(review): per-cluster mean imputation is the presumed intent - confirm.
datosVacios = dataset.groupby('hotel_cluster')['orig_destination_distance'].transform('mean')

# Only distances that are still missing are replaced; existing values are kept.
dataset['orig_destination_distance'] = dataset['orig_destination_distance'].fillna(datosVacios)

In [None]:
# Preview the first rows after imputation.
dataset.head()

# Análisis de características

In [None]:
fig, ax = plt.subplots(figsize=(15, 10))
ax.set_title("Hotel Cluster", fontsize=25, loc='center', pad=20)

# Bin hotel_cluster into five coarse classes:
#   1: < 20, 2: [20, 40), 3: [40, 60), 4: [60, 80), 5: >= 80.
# right=False makes every bin include its lower edge. The previous strict
# </> comparisons left clusters 20, 40, 60 and 80 without a class (NaN).
# The result is cast to float to keep the dtype the column had before.
dataset['catCluster'] = pd.cut(
    dataset['hotel_cluster'],
    bins=[-np.inf, 20, 40, 60, 80, np.inf],
    labels=[1, 2, 3, 4, 5],
    right=False,
).astype(float)

# x=/data= keywords: seaborn >= 0.12 no longer treats a positional Series as x.
sns.countplot(x='catCluster', data=dataset, ax=ax)

In [None]:
# Row count per site_name value.
fig, ax = plt.subplots(figsize=(15, 10))
ax.set_title("Site name", fontsize=25, loc='center', pad=40)

# x=/data= keywords: seaborn >= 0.12 no longer treats a positional Series as x.
sns.countplot(x="site_name", data=dataset, ax=ax)

In [None]:
# Row count per site_name, split by the is_booking flag.
fig, ax = plt.subplots(figsize=(15, 10))
ax.set_title("Site name", fontsize=25, loc='center', pad=40)
sns.countplot(data=dataset, x="site_name", hue="is_booking", ax=ax)

In [None]:
# Parse the check-in date and derive day-of-month and month columns, so that
# seasonality can be analysed by month.
dataset['srch_ci'] = pd.to_datetime(dataset['srch_ci'])

# The vectorized .dt accessor replaces a per-row Python lambda; missing dates
# (NaT) still produce NaN in the derived columns.
dataset['ci_day'] = dataset['srch_ci'].dt.day
dataset['ci_month'] = dataset['srch_ci'].dt.month
#dataset['ci_year'] = dataset['srch_ci'].dt.year

In [None]:
# Row count per ci_month value, with the bars fixed to the order 1..12.
month_order = list(range(1, 13))
fig, ax = plt.subplots(figsize=(15, 10))
ax.set_title("Month", fontsize=25, loc='center', pad=40)
sns.countplot(data=dataset, x="ci_month", order=month_order, ax=ax)

In [None]:
# Row count per ci_day value, with the bars fixed to the order 1..31.
day_order = list(range(1, 32))
fig, ax = plt.subplots(figsize=(15, 10))
ax.set_title("Day", fontsize=25, loc='center', pad=40)
sns.countplot(data=dataset, x="ci_day", order=day_order, ax=ax)

In [None]:
# Row count per ci_month value (ordered 1..12), split by the is_package flag.
month_order = list(range(1, 13))
fig, ax = plt.subplots(figsize=(15, 10))
ax.set_title("Month & Package", fontsize=25, loc='center', pad=40)
sns.countplot(data=dataset, x="ci_month", hue="is_package", order=month_order, ax=ax)

In [None]:
# Parse the check-out date.
dataset['srch_co'] = pd.to_datetime(dataset['srch_co'])

# The check-out month must be read from srch_co. The previous code read
# srch_ci here, which made co_month an exact copy of ci_month.
dataset['co_month'] = dataset['srch_co'].dt.month
#dataset['co_day'] = dataset['srch_co'].dt.day
#dataset['co_year'] = dataset['srch_co'].dt.year

In [None]:
# Row count per hotel_continent value.
fig, ax = plt.subplots(figsize=(15, 10))
ax.set_title("Hotel Continent", fontsize=25, loc='center', pad=20)

# x=/data= keywords: seaborn >= 0.12 no longer treats a positional Series as x.
sns.countplot(x="hotel_continent", data=dataset, ax=ax)

In [None]:
# Row count per posa_continent value.
fig, ax = plt.subplots(figsize=(15, 10))
ax.set_title("Favorite Continent", fontsize=25, loc='center', pad=20)

# x=/data= keywords: seaborn >= 0.12 no longer treats a positional Series as x.
sns.countplot(x="posa_continent", data=dataset, ax=ax)

In [None]:
# Row count per hotel_continent, split by posa_continent.
fig, ax = plt.subplots(figsize=(15, 10))
ax.set_title("Diference", fontsize=25, loc='center', pad=40)
sns.countplot(data=dataset, x="hotel_continent", hue="posa_continent", ax=ax)

In [None]:
# Row count per is_mobile flag, split by posa_continent.
fig, ax = plt.subplots(figsize=(15, 10))
ax.set_title("Mobile reservation", fontsize=25, loc='center', pad=40)
sns.countplot(data=dataset, x="is_mobile", hue="posa_continent", ax=ax)

In [None]:
# Row count per channel value, with the bars fixed to the order 0..9.
channel_order = list(range(0, 10))
fig, ax = plt.subplots(figsize=(15, 10))
ax.set_title("Channel", fontsize=25, loc='center', pad=40)
sns.countplot(data=dataset, x="channel", order=channel_order, ax=ax)

In [None]:
# Row count per is_mobile flag, split by the is_booking flag.
fig, ax = plt.subplots(figsize=(15, 10))
ax.set_title("Booiking by mobile", fontsize=25, loc='center', pad=40)
sns.countplot(data=dataset, x="is_mobile", hue="is_booking", ax=ax)

In [None]:
# Row count per srch_destination_type_id, split by posa_continent.
fig, ax = plt.subplots(figsize=(15, 10))
ax.set_title("Destination Type", fontsize=25, loc='center', pad=40)
sns.countplot(data=dataset, x="srch_destination_type_id", hue="posa_continent", ax=ax)

In [None]:
# Row count per srch_adults_cnt value, with the bars fixed to the order 0..9.
fig, ax = plt.subplots(figsize=(15, 10))
ax.set_title("Adults", fontsize=25, loc='center', pad=40)

# The figure is titled "Adults", so it must plot srch_adults_cnt; the previous
# code plotted srch_children_cnt here, duplicating the "Children" figure.
sns.countplot(x="srch_adults_cnt", order=list(range(0, 10)), data=dataset, ax=ax)

In [None]:
# Row count per srch_children_cnt value, split by the is_booking flag.
fig, ax = plt.subplots(figsize=(15, 10))
ax.set_title("Children", fontsize=25, loc='center', pad=40)
sns.countplot(data=dataset, x="srch_children_cnt", hue="is_booking", ax=ax)

In [None]:
# Row count per is_package flag, split by the is_booking flag.
fig, ax = plt.subplots(figsize=(15, 10))
ax.set_title("Package", fontsize=25, loc='center', pad=40)
sns.countplot(data=dataset, x="is_package", hue="is_booking", ax=ax)

In [None]:
# Row count per srch_rm_cnt value.
fig, ax = plt.subplots(figsize=(15, 10))
ax.set_title("Hotel rooms", fontsize=25, loc='center', pad=40)

# x=/data= keywords: seaborn >= 0.12 no longer treats a positional Series as x.
#sns.countplot(x="srch_rm_cnt", hue="is_booking", data=dataset)
sns.countplot(x="srch_rm_cnt", data=dataset, ax=ax)

In [None]:
# Bar plot of is_booking (y) against cnt (x).
fig, ax = plt.subplots(figsize=(15, 10))
ax.set_title("Similar events", fontsize=25, loc='center', pad=40)
sns.barplot(data=dataset, x="cnt", y="is_booking", ax=ax)

In [None]:
fig, ax = plt.subplots(figsize=(15, 10))
ax.set_title("hotel_country", fontsize=25, loc='center', pad=20)

# Bin hotel_country into five ranges, each labelled with its upper bound:
#   "20": < 20, "40": [20, 40), "60": [40, 60), "80": [60, 80), "100": >= 80.
# right=False makes every bin include its lower edge. The previous strict
# </> comparisons left countries 20, 40, 60 and 80 without a label (NaN).
# astype(object) keeps the column as plain strings, as it was before.
country_labels = ["20", "40", "60", "80", "100"]
dataset['country'] = pd.cut(
    dataset['hotel_country'],
    bins=[-np.inf, 20, 40, 60, 80, np.inf],
    labels=country_labels,
    right=False,
).astype(object)

# Plot only rows that have a hotel_country; `order` keeps the bars in range
# order instead of order of first appearance.
sns.countplot(
    x='country',
    data=dataset.loc[dataset['hotel_country'].notnull()],
    order=country_labels,
    ax=ax,
)

In [None]:
# Box plot of hotel_country, drawn before the outlier rows are removed.
fig, ax = plt.subplots(figsize=(20, 8))
sns.boxplot(data=dataset, x="hotel_country", ax=ax)

# Index labels of the rows with hotel_country above 200; dropped in place.
index_NaN_age = dataset.index[dataset['hotel_country'] > 200].tolist()
dataset.drop(index=index_NaN_age, inplace=True)

In [None]:
# Box plot of hotel_market, drawn before the outlier rows are removed.
fig, ax = plt.subplots(figsize=(20, 8))
sns.boxplot(data=dataset, x="hotel_market", ax=ax)

# Index labels of the rows with hotel_market above 1500; dropped in place.
index_NaN_age = dataset.index[dataset['hotel_market'] > 1500].tolist()
dataset.drop(index=index_NaN_age, inplace=True)

In [None]:
# Box plot of cnt, drawn before the outlier rows are removed.
fig, ax = plt.subplots(figsize=(20, 8))
sns.boxplot(data=dataset, x="cnt", ax=ax)

# Index labels of the rows with cnt above 30; dropped in place.
index_NaN_age = dataset.index[dataset['cnt'] > 30].tolist()
dataset.drop(index=index_NaN_age, inplace=True)

In [None]:
# Box plot of site_name, drawn before the outlier rows are removed.
fig, ax = plt.subplots(figsize=(20, 8))
sns.boxplot(data=dataset, x="site_name", ax=ax)

# Index labels of the rows with site_name above 30; dropped in place.
index_NaN_age = dataset.index[dataset['site_name'] > 30].tolist()
dataset.drop(index=index_NaN_age, inplace=True)

In [None]:
# Box plot of orig_destination_distance, drawn before the outlier rows are
# removed.
fig, ax = plt.subplots(figsize=(20, 8))
sns.boxplot(data=dataset, x="orig_destination_distance", ax=ax)

# Index labels of the rows with a distance above 6000; dropped in place.
index_NaN_age = dataset.index[dataset['orig_destination_distance'] > 6000].tolist()
dataset.drop(index=index_NaN_age, inplace=True)

In [None]:
# Box plot of user_location_region, drawn before the outlier rows are removed.
fig, ax = plt.subplots(figsize=(20, 8))
sns.boxplot(data=dataset, x="user_location_region", ax=ax)

# Index labels of the rows with user_location_region above 750; dropped in
# place.
index_NaN_age = dataset.index[dataset['user_location_region'] > 750].tolist()
dataset.drop(index=index_NaN_age, inplace=True)

In [None]:
# Box plot of user_location_country; no rows are removed for this column.
fig, ax = plt.subplots(figsize=(20, 8))
sns.boxplot(data=dataset, x="user_location_country", ax=ax)

# Disabled outlier filter. As written (> 70 AND < 60) it could not match any
# row.
#dataset.loc[(dataset['user_location_country'] > 70) & (dataset['user_location_country'] < 60)]
#index_NaN_age = list(dataset.loc[(dataset['user_location_country'] > 70) & (dataset['user_location_country'] < 60)].index)
#dataset.drop(index_NaN_age, axis=0, inplace=True)

In [None]:
# (rows, columns) remaining after the outlier rows were dropped.
dataset.shape

# División del conjunto de datos

In [None]:
# Columns left out of the model inputs: the target itself, the cluster id it
# was derived from, the raw date columns, the user id and the distance.
# NOTE(review): the string-valued 'country' helper column created for the
# hotel_country plot is not excluded here and therefore stays in the features.
excluded_columns = [
    'catCluster',
    'hotel_cluster',
    'srch_ci',
    'srch_co',
    'date_time',
    'user_id',
    'orig_destination_distance',
]

# Target: the binned hotel cluster class.
targets = dataset['catCluster']
feature_vector = dataset.drop(columns=excluded_columns)

feature_vector.info()

In [None]:
# Categorical columns to label-encode. The list is empty, so the loop below
# currently does nothing.
columns_to_transform = []

# Replace missing NAME_CONTRACT_TYPE values with 0, but only when that column
# is present. No other visible cell references the column, and an unguarded
# lookup raises KeyError when it is absent.
if 'NAME_CONTRACT_TYPE' in feature_vector.columns:
    feature_vector.loc[feature_vector['NAME_CONTRACT_TYPE'].isnull(), 'NAME_CONTRACT_TYPE'] = 0

# Replace each listed column with integer codes from a fresh LabelEncoder.
for column in columns_to_transform:
    le = preprocessing.LabelEncoder()
    feature_vector[column] = le.fit_transform(feature_vector[column])

feature_vector.head()

# PCA

In [None]:
# Fit a PCA that keeps as many components as needed to explain 96% of the
# variance (a fractional n_components is used together with svd_solver='full').
pca = PCA(n_components=0.96, svd_solver='full').fit(feature_vector)

# Project the features onto the retained components.
new_feature_vector = pca.transform(feature_vector)

# Summary of the fitted model: requested setting, resulting number of
# components and the variance ratio explained by each of them.
print('Model information:')
print('Number of components elected: %s' % pca.n_components)
print('New feature dimension: %s' % pca.n_components_)
print('Variance of every feature: %s' % pca.explained_variance_ratio_)

# First 10 rows of the projected data.
print('New feature vector: %s' % new_feature_vector[:10])

# Una dimensión es la correcta según PCA.

# Attribute subset selection with trees

In [None]:
# Fit an extra-trees classifier on the features so that its feature
# importances can drive the attribute selection.
extra_tree = ExtraTreesClassifier().fit(feature_vector, targets)

print('Model information:')

# Relative importance of each column of feature_vector.
print('Importance of every feature: ' + str(extra_tree.feature_importances_))

# prefit=True: the estimator has already been fitted, so the selector uses it
# as-is.
model = SelectFromModel(extra_tree, prefit=True)

# Keep only the columns the selector retains.
new_feature_vector = model.transform(feature_vector)

# First 10 rows of the reduced feature matrix.
print('New feature vector: ' + str(new_feature_vector[:10]))

# Normalización Z-score

In [None]:
# Z-score standardization of every feature column.
standardized_data = preprocessing.scale(feature_vector)

# First 5 rows of the standardized matrix.
# The value is interpolated with %: passing it as a second print() argument
# printed the literal "%s" followed by the array.
print('New feature vector: %s' % (standardized_data[:5],))

# Normalización Min-Max

In [None]:
# Min-max normalization: fit the scaler on the features, then transform them.
min_max_scaler = preprocessing.MinMaxScaler()

min_max_scaler.fit(feature_vector)

# Per-column minimum and maximum seen during fit.
# The values are interpolated with %: passing them as a second print()
# argument printed the literal "%s" followed by the array.
print('Model information:')
print('Data min: %s' % (min_max_scaler.data_min_,))
print('Data max: %s' % (min_max_scaler.data_max_,))

normalized_data = min_max_scaler.transform(feature_vector)

# First 5 rows of the normalized matrix.
print('New feature vector: %s' % (normalized_data[:5],))

# Modelos

In [None]:
# Hold out 25% of the rows for testing. A fixed random_state makes the split
# (and therefore every score computed on it) reproducible across runs.
X_train, X_test, Y_train, Y_test = train_test_split(
    feature_vector, targets, test_size=0.25, random_state=42
)

In [None]:
# Ensemble classifiers built on depth-limited decision trees.
names = ["Bagging Classifier", "AdaBoost Classifier"]
models = [
    # The base tree is passed positionally: its keyword was renamed from
    # `base_estimator` to `estimator` in scikit-learn 1.2 (old name removed in
    # 1.4), but it is the first positional parameter under both spellings.
    BaggingClassifier(
        tree.DecisionTreeClassifier(criterion='gini', max_depth=10)
    ),
    AdaBoostClassifier(
        tree.DecisionTreeClassifier(criterion='gini', max_depth=10)
    ),
]

for name, em_clf in zip(names, models):
    print("###################---" + name + "---###################")

    em_clf.fit(X_train, Y_train)

    # Hold-out evaluation on the test split.
    test_data_predicted = em_clf.predict(X_test)
    score = metrics.accuracy_score(Y_test, test_data_predicted)

    # Interpolate with %: passing score as a second print() argument printed
    # the literal "%s".
    print("Model Score: %s" % score)
    print("Confusion Matrix:")
    print(metrics.confusion_matrix(Y_test, test_data_predicted))

    print("Evaluation report")
    print(classification_report(Y_test, test_data_predicted))

    print("Kappa Statistic: %s" % (str(cohen_kappa_score(Y_test, test_data_predicted))))

    # 10-fold cross-validation on the training split. The previous code built
    # the KFold splitter but never used it and reported the hold-out score
    # again. cross_val_score fits clones, so em_clf itself is left untouched.
    kf = KFold(n_splits=10, shuffle=True)
    score_array = cross_val_score(em_clf, X_train, Y_train, cv=kf)

    print("Cross Validation: %s" % (str(np.average(score_array))))
    

In [None]:
# A single-hidden-layer neural network and a depth-limited random forest.
names = ["MLP Classifier", "Random Forest Classifier"]
models = [
    # (50,) is a one-element tuple: one hidden layer of 50 units. The previous
    # (50) was just the integer 50.
    MLPClassifier(
        hidden_layer_sizes=(50,),
        activation="relu",
        solver="adam",
    ),
    RandomForestClassifier(
        criterion='gini',
        max_depth=10,
    ),
]

for name, em_clf in zip(names, models):
    print("###################---" + name + "---###################")

    em_clf.fit(X_train, Y_train)

    # Hold-out evaluation on the test split.
    test_data_predicted = em_clf.predict(X_test)
    score = metrics.accuracy_score(Y_test, test_data_predicted)

    # Interpolate with %: passing score as a second print() argument printed
    # the literal "%s".
    print("Model Score: %s" % score)
    print("Confusion Matrix:")
    print(metrics.confusion_matrix(Y_test, test_data_predicted))

    print("Evaluation report")
    print(classification_report(Y_test, test_data_predicted))

    print("Kappa Statistic: %s" % (str(cohen_kappa_score(Y_test, test_data_predicted))))

    # 10-fold cross-validation on the training split. The previous code built
    # the KFold splitter but never used it and reported the hold-out score
    # again. cross_val_score fits clones, so em_clf itself is left untouched.
    kf = KFold(n_splits=10, shuffle=True)
    score_array = cross_val_score(em_clf, X_train, Y_train, cv=kf)

    print("Cross Validation: %s" % (str(np.average(score_array))))
    

In [None]:
# Baseline classifiers with default hyper-parameters.
names = ["DecisionTreeClassifier", "GaussianNB"]
models = [
    tree.DecisionTreeClassifier(),
    GaussianNB(),
]

for name, em_clf in zip(names, models):
    print("###################---" + name + "---###################")

    em_clf.fit(X_train, Y_train)

    # Hold-out evaluation on the test split.
    test_data_predicted = em_clf.predict(X_test)
    score = metrics.accuracy_score(Y_test, test_data_predicted)

    # Interpolate with %: passing score as a second print() argument printed
    # the literal "%s".
    print("Model Score: %s" % score)
    print("Confusion Matrix:")
    print(metrics.confusion_matrix(Y_test, test_data_predicted))

    print("Evaluation report")
    print(classification_report(Y_test, test_data_predicted))

    print("Kappa Statistic: %s" % (str(cohen_kappa_score(Y_test, test_data_predicted))))

    # 10-fold cross-validation on the training split. The previous code built
    # the KFold splitter but never used it and reported the hold-out score
    # again. cross_val_score fits clones, so em_clf itself is left untouched.
    kf = KFold(n_splits=10, shuffle=True)
    score_array = cross_val_score(em_clf, X_train, Y_train, cv=kf)

    print("Cross Validation: %s" % (str(np.average(score_array))))
    
