# Beach VS Mountain

## Imports

In [1]:
import pandas as pd

In [2]:
df = pd.read_csv("data/csv/mountains_vs_beaches_preferences.csv")

In [78]:
df

Unnamed: 0,Age,Gender,Income,Education_Level,Travel_Frequency,Preferred_Activities,Vacation_Budget,Location,Proximity_to_Mountains,Proximity_to_Beaches,Favorite_Season,Pets,Environmental_Concerns,Preference
0,56,male,71477,bachelor,9,skiing,2477,urban,175,267,summer,0,1,1
1,69,male,88740,master,1,swimming,4777,suburban,228,190,fall,0,1,0
2,46,female,46562,master,0,skiing,1469,urban,71,280,winter,0,0,1
3,32,non-binary,99044,high school,6,hiking,1482,rural,31,255,summer,1,0,1
4,60,female,106583,high school,5,sunbathing,516,suburban,23,151,winter,1,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
52439,67,female,85325,master,3,hiking,1762,suburban,51,262,fall,1,0,1
52440,37,female,110346,bachelor,7,swimming,1676,urban,259,84,spring,1,0,0
52441,53,non-binary,71080,doctorate,7,swimming,3877,urban,164,27,fall,0,1,0
52442,40,female,75174,doctorate,9,swimming,4620,suburban,173,10,summer,0,1,0


## Functions

In [84]:
def transform_list_to_dict(list):
    res = {}
    for i in range(len(list)):
        res[list[i]] = i
    return res

## Get all types of data

### Type : Genders

In [89]:
types_genders = transform_list_to_dict(df.Gender.unique())
print(types_genders)

{'male': 0, 'female': 1, 'non-binary': 2}


### Type : Education_level

In [90]:
types_education_level = transform_list_to_dict(df.Education_Level.unique())
print(types_education_level)

{'bachelor': 0, 'master': 1, 'high school': 2, 'doctorate': 3}


### Type : Travel_frequency

In [94]:
types_travel_frequency = transform_list_to_dict(df.Travel_Frequency.unique())
print(types_travel_frequency)

{np.int64(9): 0, np.int64(1): 1, np.int64(0): 2, np.int64(6): 3, np.int64(5): 4, np.int64(3): 5, np.int64(8): 6, np.int64(4): 7, np.int64(7): 8, np.int64(2): 9}


### Type : Location

In [95]:
types_locations = transform_list_to_dict(df.Location.unique())
print(types_locations)

{'urban': 0, 'suburban': 1, 'rural': 2}


### Type : Preferred_Activities

In [97]:
types_preferred_activities = transform_list_to_dict(df.Preferred_Activities.unique())
print(types_preferred_activities)

{'skiing': 0, 'swimming': 1, 'hiking': 2, 'sunbathing': 3}


### Type : Seasons

In [99]:
types_favorite_season = transform_list_to_dict(df.Favorite_Season.unique())
print(types_favorite_season)

{'summer': 0, 'fall': 1, 'winter': 2, 'spring': 3}


## Get all min and max of data

### Min / Max : Age

In [47]:
min_max_age = (df.Age.min(), df.Age.max())
print(min_max_age)

(np.int64(18), np.int64(69))


### Min / Max : Income

In [50]:
min_max_incomes = (df.Income.min(), df.Income.max())
print(min_max_incomes)

(np.int64(20001), np.int64(119999))


### Min / Max : Vacation_Budget

In [51]:
min_max_vacation_budgets = (df.Vacation_Budget.min(), df.Vacation_Budget.max())
print(min_max_vacation_budgets)

(np.int64(500), np.int64(4999))


## Let's make ML about it

In [54]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

### X : data & Y : labels



In [107]:
doc = {
    "data": [],
    "label": [],
    "description": """
        data : 
            [
                [Age,
                Income,
                Favorite_Season,
                Vacation_Budget
                ]
            ]
        label : [Preference (0 : Beach, 1 : Montain]""",    
}

for index, row in df.iterrows():
    doc["data"].append(
        [
            row.Age,
            row.Income,
            types_favorite_season[row.Favorite_Season],
            row.Vacation_Budget,
        ]
    )

    doc["label"].append(row.Preference)

print(f"Data : {doc["data"][0]}\n Label : {doc["label"][0]}")

Data : [56, 71477, 0, 2477]
 Label : 1


### Séparer le jeu de donnée en jeu de données de test

In [101]:
x_train, x_test, y_train, y_test = train_test_split(doc["data"], doc["label"], test_size=0.2)

### Create the Random Forest

In [102]:
rfc = RandomForestClassifier()

In [103]:
rfc.fit(x_train, y_train)

### Make the prediction

In [104]:
ypred = rfc.predict(x_test)

In [105]:
confusion_matrix(ypred, y_test)

array([[7519, 2510],
       [ 354,  106]])

In [106]:
print(classification_report(ypred, y_test))

              precision    recall  f1-score   support

           0       0.96      0.75      0.84     10029
           1       0.04      0.23      0.07       460

    accuracy                           0.73     10489
   macro avg       0.50      0.49      0.45     10489
weighted avg       0.91      0.73      0.81     10489

