In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [92]:
# Read the train and the test data 
train_users = pd.read_csv('../data/train_users_2.csv')
test_users = pd.read_csv('../data/test_users.csv')


# Extracting labels from the train data
train_users_labels = train_users.loc[:,'country_destination']
print (train_users_labels.head(n=5))

# Extracting attributes from the train data
train_users_attrs = train_users.iloc[:,0:15]
print(train_users_attrs.head(n=5))

train_users = train_users_attrs

0      NDF
1      NDF
2       US
3    other
4       US
Name: country_destination, dtype: object
           id date_account_created  timestamp_first_active date_first_booking  \
0  gxn3p5htnn           2010-06-28          20090319043255                NaN   
1  820tgsjxq7           2011-05-25          20090523174809                NaN   
2  4ft3gnwmtx           2010-09-28          20090609231247         2010-08-02   
3  bjjt8pjhuk           2011-12-05          20091031060129         2012-09-08   
4  87mebub9p4           2010-09-14          20091208061105         2010-02-18   

      gender   age signup_method  signup_flow language affiliate_channel  \
0  -unknown-   NaN      facebook            0       en            direct   
1       MALE  38.0      facebook            0       en               seo   
2     FEMALE  56.0         basic            3       en            direct   
3     FEMALE  42.0      facebook            0       en            direct   
4  -unknown-  41.0         basic     

In [4]:
train_users = train_users.drop(['date_first_booking'], axis=1)
test_users = test_users.drop(['date_first_booking'], axis=1)

In [5]:
# Date is split into 3 parts as year, month and day in both test and train. These are added as
# new features in both test and train

date_acc_created = np.vstack(train_users.date_account_created.astype(str).apply(
        lambda x: list(map(int, x.split('-')))).values)
train_users['created_year'] = date_acc_created[:,0]
train_users['created_month'] = date_acc_created[:,1]
train_users['created_day'] = date_acc_created[:,2]
train_users = train_users.drop(['date_account_created'], axis=1)

date_acc_created_test = np.vstack(test_users.date_account_created.astype(str).apply(
        lambda x: list(map(int, x.split('-')))).values)
test_users['created_year'] = date_acc_created_test[:,0]
test_users['created_month'] = date_acc_created_test[:,1]
test_users['created_day'] = date_acc_created_test[:,2]
test_users = test_users.drop(['date_account_created'], axis=1)

In [6]:
# Replacing unknown values in gender with -1 and null values with -1
train_users.loc[ train_users['gender'] == '-unknown-', 'gender'] = -1
train_users.loc[ train_users['gender'].isnull(), 'gender' ] = -1
test_users.loc[ test_users['gender'] == '-unknown-', 'gender'] = -1
test_users.loc[ test_users['gender'].isnull(), 'gender'] = -1

In [7]:
# Encoding Female with 0, Male with 1 and Other with 2 in both test and train data
gender_translation = {'FEMALE' : 0,
                     'MALE' : 1,
                     'OTHER' : 2,
                     -1 : -1 }
for data in [train_users, test_users]:
    data['gender'] = data['gender'].apply(lambda x: gender_translation[x])

In [9]:
# Finding valid values for gender and invalid values for gender
nan_gender_count = len(train_users.loc[train_users['gender'] == -1, 'gender'])
valid_gender_count = len(train_users.gender.values) - nan_gender_count

# Creating a map with the gender distribution
count_map = train_users['gender'].value_counts()
print("Existing gender value distribution")

for k, v in count_map.items():   # use .items() instead of .iteritems()
    if k == -1:
        continue
    print(k, ":", float(v) / float(valid_gender_count))


Existing gender value distribution
0 : 0.5353209412124351
1 : 0.46228441870536585
2 : 0.002394640082198993


In [11]:
for k, v in count_map.items():
    if k == -1:
        continue
    c = int(nan_gender_count * float(v) / float(valid_gender_count))
    missing_idx = train_users.index[train_users["gender"] == -1][:c]
    train_users.loc[missing_idx, "gender"] = k


In [12]:
train_users.gender.describe()

count    213451.000000
mean          0.467067
std           0.503700
min          -1.000000
25%           0.000000
50%           0.000000
75%           1.000000
max           2.000000
Name: gender, dtype: float64

In [14]:
nan_gender_count = len(test_users.loc[test_users['gender'] == -1, 'gender'])
valid_gender_count = len(test_users.gender) - nan_gender_count

# use Series.value_counts()
count_map = test_users['gender'].value_counts()

print("Existing gender value distribution")
for k, v in count_map.items():   # use .items() instead of .iteritems()
    if k == -1:
        continue
    print(k, ":", float(v) / float(valid_gender_count))

for k, v in count_map.items():
    if k == -1:
        continue
    c = int(nan_gender_count * float(v) / float(valid_gender_count))
    for i in range(len(test_users.gender.values)):
        if test_users.gender.values[i] == -1:
            test_users.gender.values[i] = k
            c -= 1
        if c == 0:
            break

test_users.loc[62094, "gender"] = 0  # safer than .values[62094]


Existing gender value distribution
0 : 0.5116944601469757
1 : 0.486468343697004
2 : 0.0018371961560203504


In [15]:
train_users['age'].describe()

count    125461.000000
mean         49.668335
std         155.666612
min           1.000000
25%          28.000000
50%          34.000000
75%          43.000000
max        2014.000000
Name: age, dtype: float64

In [16]:
# Replacing invalid age with NaN in test and train

train_users.loc[train_users['age'] > 95, 'age'] = np.nan
train_users.loc[train_users['age'] < 16, 'age'] = np.nan
test_users.loc[test_users['age'] > 95, 'age'] = np.nan
test_users.loc[test_users['age'] < 16, 'age'] = np.nan

In [17]:
# Replace missing age with median
print (train_users.age.median())
print (test_users.age.median())
train_users.loc[ train_users['age'].isnull(), 'age' ] = train_users.age.median()
test_users.loc[ test_users['age'].isnull(), 'age' ] = test_users.age.median()

34.0
31.0


In [18]:
# Encoding the signup method for test
signup_translation = {'facebook' : 0,
                     'google' : 1,
                     'basic' : 2,
                     'weibo' : 3}
for data in [train_users, test_users]:
    data['signup_method'] = data['signup_method'].apply(lambda x: signup_translation[x])

In [19]:
# Encoding the language in both train and test
test_users.loc[ test_users['language'] == '-unknown-', 'language'] = "en"

In [20]:
language_encoding = {'en'      :       1       ,
'zh'      :       2       ,
'fr'      :       3       ,
'es'      :       4       ,
'ko'      :       5       ,
'de'      :       6       ,
'it'      :       7       ,
'ru'      :       8       ,
'pt'      :       9       ,
'ja'      :       10      ,
'sv'      :       11      ,
'nl'      :       12      ,
'tr'      :       13      ,
'da'      :       14      ,
'pl'      :       15      ,
'cs'      :       16      ,
'no'      :       17      ,
'el'      :       18      ,
'th'      :       19      ,
'id'      :       20      ,
'hu'      :       21      ,
'fi'      :       22      ,
'ca'      :       23      ,
'is'      :       24      ,
'hr'      :       25}

for data in [train_users, test_users]:
    data['language'] = data['language'].apply(lambda x: language_encoding[x])

In [21]:
# Encoding for affiliate_channel
affiliate_channel_encoding = {'direct' : 1,
                             'sem-brand' : 2,
                             'sem-non-brand' : 3,
                             'other' : 4,
                             'api' : 5,
                             'seo' : 6,
                             'content' : 7,
                             'remarketing' : 8}
for data in [train_users, test_users]:
    data['affiliate_channel'] = data['affiliate_channel'].apply(lambda x: affiliate_channel_encoding[x])

In [22]:
# Encoding for affiliate_provider
affiliate_provider_encoding = {'direct':1,
'google':2,
'other':3,
'craigslist':4,
'bing':5,
'facebook':6,
'vast':7,
'padmapper':8,
'facebook-open-graph':9,
'yahoo':10,
'gsp':11,
'meetup':12,
'email-marketing':13,
'naver':14,
'baidu':15,
'yandex':16,
'wayn':17,
'daum':18}

for data in [train_users, test_users]:
    data['affiliate_provider'] = data['affiliate_provider'].apply(lambda x: affiliate_provider_encoding[x])

In [23]:
# Encoding for first_affiliate_tracked
train_users.loc[ train_users['first_affiliate_tracked'].isnull(), 'first_affiliate_tracked'] = "untracked"
test_users.loc[ test_users['first_affiliate_tracked'].isnull(), 'first_affiliate_tracked'] = "untracked"
first_affiliate_tracked_encoding = {'untracked' : 1,
                                   'linked' : 2,
                                   'omg' : 3,
                                   'tracked-other' : 4,
                                   'product' : 5,
                                   'marketing' : 6,
                                   'local ops' : 7}
for data in [train_users, test_users]:
    data['first_affiliate_tracked'] = data['first_affiliate_tracked'].apply(lambda x: first_affiliate_tracked_encoding[x])

In [24]:
# Encoding for signup_app
signup_app_encoding = {'Web' : 1,
                      'iOS' : 2,
                      'Android' : 3,
                      'Moweb' : 4}
for data in [train_users, test_users]:
    data['signup_app'] = data['signup_app'].apply(lambda x: signup_app_encoding[x])

In [25]:
# Encoding for first_device_type
first_device_type_encoding = { 'Mac Desktop' : 1,
                             'iPhone' : 2,
                             'Windows Desktop' : 3,
                             'Android Phone' : 4,
                             'iPad' : 5,
                             'Android Tablet' : 6,
                             'Other/Unknown' : 7,
                             'Desktop (Other)' : 8,
                             'SmartPhone (Other)' : 9}
for data in [train_users, test_users]:
    data['first_device_type'] = data['first_device_type'].apply(lambda x: first_device_type_encoding[x])

In [26]:
# Encoding for first_browser
first_browser_encoding = {'Chrome':1,
'Safari':2,
'Firefox':3,
'-unknown-':4,
'IE':5,
'Mobile Safari':6,
'Chrome Mobile':7,
'Android Browser':8,
'AOL Explorer':9,
'Opera':10,
'Silk':11,
'Chromium':12,
'BlackBerry Browser':13,
'Maxthon':14,
'IE Mobile':15,
'Apple Mail':16,
'Sogou Explorer':17,
'Mobile Firefox':18,
'RockMelt':19,
'SiteKiosk':20,
'Iron':21,
'IceWeasel':22,
'Pale Moon':23,
'SeaMonkey':24,
'Yandex.Browser':25,
'CometBird':26,
'Camino':27,
'TenFourFox':28,
'wOSBrowser':29,
'CoolNovo':30,
'Avant Browser':31,
'Opera Mini':32,
'Mozilla':33,
'Comodo Dragon':34,
'TheWorld Browser':35,
'Crazy Browser':36,
'Flock':37,
'OmniWeb':38,
'SlimBrowser':39,
'Opera Mobile':40,
'Conkeror':41,
'Outlook 2007':42,
'Palm Pre web browser':43,
'Stainless':44,
'NetNewsWire':45,
'Kindle Browser':46,
'Epic':47,
'Googlebot':48,
'Arora':49,
'Google Earth':50,
'IceDragon':51,
'PS Vita browser':52,
'IBrowse' : 53,
'UC Browser' : 54,
'IBrowse': 55,
'Nintendo Browser' : 56}


for data in [train_users, test_users]:
    data['first_browser'] = data['first_browser'].apply(lambda x: first_browser_encoding[x])

In [27]:
# Reading sessions data
sessions = pd.read_csv('sessions.csv')

In [31]:
# frequency of each user_id in sessions data
df = sessions['user_id'].value_counts()
print (df.shape)
print (df)

(135483,)
user_id
mxqbh3ykxl    2722
0hjoc5q8nf    2644
mjbl6rrj52    2476
l5lgm3w5pc    2424
wg9413iaux    2362
              ... 
6vei32cuik       1
426kdkexgi       1
q6mj8zlbb4       1
kjl3lc2tjn       1
xjcyvroezb       1
Name: count, Length: 135483, dtype: int64


In [32]:
# Updating session_count for users present in the train data

train_users['session_count'] = train_users['id'].map(df).fillna(0).astype(int)



In [33]:
print (train_users['session_count'].max())

2644


In [34]:
# Encding for country_destination
country_destination_encoding = {'NDF': 0,
'US' : 1,
'other' : 2,
'FR' : 3,
'IT' : 4,
'GB' : 5,
'ES' : 6,
'CA' : 7,
'DE' : 8,
'NL' : 9,
'AU' : 10,
'PT' : 11}

# Convert series to frame
labels_df = train_users_labels.to_frame()

for data in [labels_df]:
    data['country_destination'] = data['country_destination'].apply(lambda x: country_destination_encoding[x])

In [38]:
#print train_users_merge.head()
from sklearn.preprocessing import StandardScaler

stdscaler = StandardScaler()

# numeric columns in train_users
numeric_cols = train_users.select_dtypes(include=['int64','float64']).columns
train_users[numeric_cols] = stdscaler.fit_transform(train_users[numeric_cols])

# create train_users_merge first if missing
if 'train_users_merge' not in locals():
    train_users_merge = train_users.copy()

numeric_cols_merge = train_users_merge.select_dtypes(include=['int64','float64']).columns
train_users_merge[numeric_cols_merge] = stdscaler.fit_transform(train_users_merge[numeric_cols_merge])



#test_users_scaled = stdscaler.fit_transform(test_users.values);
#test_users = pd.DataFrame(test_users_scaled, columns = test_users.columns)

In [39]:
train_users['country_destination'] = labels_df
print(train_users.head())

           id  timestamp_first_active    gender       age  signup_method  \
0  gxn3p5htnn               -4.380020 -0.927275 -0.163283      -1.596552   
1  820tgsjxq7               -4.357961  1.058038  0.287705      -1.596552   
2  4ft3gnwmtx               -4.348661 -0.927275  2.317149       0.628333   
3  bjjt8pjhuk               -4.303076 -0.927275  0.738692      -1.596552   
4  87mebub9p4               -4.283949 -0.927275  0.625945       0.628333   

   signup_flow  language  affiliate_channel  affiliate_provider  \
0    -0.427798 -0.141579          -0.582242           -0.468760   
1    -0.427798 -0.141579           2.556797            0.251719   
2    -0.035009 -0.141579          -0.582242           -0.468760   
3    -0.427798 -0.141579          -0.582242           -0.468760   
4    -0.427798 -0.141579          -0.582242           -0.468760   

   first_affiliate_tracked  signup_app  first_device_type  first_browser  \
0                -0.798954   -0.359375          -0.876174      -

In [40]:
train_users_merge['country_destination'] = labels_df
print(train_users_merge.head())

           id  timestamp_first_active    gender       age  signup_method  \
0  gxn3p5htnn               -4.380020 -0.927275 -0.163283      -1.596552   
1  820tgsjxq7               -4.357961  1.058038  0.287705      -1.596552   
2  4ft3gnwmtx               -4.348661 -0.927275  2.317149       0.628333   
3  bjjt8pjhuk               -4.303076 -0.927275  0.738692      -1.596552   
4  87mebub9p4               -4.283949 -0.927275  0.625945       0.628333   

   signup_flow  language  affiliate_channel  affiliate_provider  \
0    -0.427798 -0.141579          -0.582242           -0.468760   
1    -0.427798 -0.141579           2.556797            0.251719   
2    -0.035009 -0.141579          -0.582242           -0.468760   
3    -0.427798 -0.141579          -0.582242           -0.468760   
4    -0.427798 -0.141579          -0.582242           -0.468760   

   first_affiliate_tracked  signup_app  first_device_type  first_browser  \
0                -0.798954   -0.359375          -0.876174      -

In [41]:
train_users.to_csv('train_users_wo_merge_scale.csv',index=False)
train_users_merge.to_csv('train_users_merge_scale.csv',index=False)

In [50]:
# # 1. Recreate your merged datasets
# train_users_merge = pd.merge(train_users, sessions, left_on="id", right_on="user_id", how="left")
# test_users_merge  = pd.merge(test_users,  sessions, left_on="id", right_on="user_id", how="left")


# # 2. Keep unscaled copies
# train_users_merge_wo_scale = train_users_merge.copy()
# test_users_merge_wo_scale  = test_users_merge.copy()

# # 3. Attach labels to training set
# train_users_merge_wo_scale['country_destination'] = labels_df

# # 4. Save to CSV
# train_users_merge_wo_scale.to_csv('train_users_merge_wo_scale.csv', index=False)
# test_users_merge_wo_scale.to_csv('test_users_merge_wo_scale.csv', index=False)


sessions_agg = (
    sessions.groupby('user_id')
    .agg(
        session_count=('user_id','count'),
        total_secs=('secs_elapsed','sum')
    )
    .reset_index()
)

# Merge aggregated sessions with users
train_users_merge = train_users.merge(sessions_agg, left_on="id", right_on="user_id", how="left")
test_users_merge  = test_users.merge(sessions_agg,  left_on="id", right_on="user_id", how="left")

# drop duplicate key if you want
train_users_merge = train_users_merge.drop(columns=['user_id'])
test_users_merge  = test_users_merge.drop(columns=['user_id'])

In [51]:
def folds_to_split(data,targets,train,test):
    data_tr = pd.DataFrame(data).iloc[train]
    data_te = pd.DataFrame(data).iloc[test]
    labels_tr = pd.DataFrame(targets).iloc[train]
    labels_te = pd.DataFrame(targets).iloc[test]
    return [data_tr, data_te, labels_tr, labels_te]

# Creating the NDCG Scorer


In [53]:
# Reference Kaggle

from sklearn.preprocessing import LabelBinarizer
from sklearn.metrics import make_scorer

def dcg_score(y_true, y_score, k=5):
    order = np.argsort(y_score)[::-1]
    y_true = np.take(y_true, order[:k])

    gain = 2 ** y_true - 1

    discounts = np.log2(np.arange(len(y_true)) + 2)
    return np.sum(gain / discounts)


#def ndcg_score(ground_truth, predictions, k=5):
def ndcg_score(te_labels, predict, k):
    
    lb = LabelBinarizer()
    lb.fit(range(len(predict) + 1))
    T = lb.transform(te_labels)

    scores = []

    # Iterate over each y_true and compute the DCG score
    for y_true, y_score in zip(T, predict):
        actual = dcg_score(y_true, y_score, k)
        best = dcg_score(y_true, y_true, k)
        if best == 0:
            best = 0.000000001
        score = float(actual) / float(best)
        scores.append(score)
    return np.mean(scores)


# NDCG Scorer function
ndcg_scorer = make_scorer(ndcg_score, needs_proba=True, k=5)
print(ndcg_scorer)


make_scorer(ndcg_score, response_method='predict', needs_proba=True, k=5)


In [54]:
#print train_users.head()
train_users=train_users.drop(['id'], axis=1)
#print train_users.head()

# Modeling

In [59]:
from sklearn.naive_bayes import GaussianNB
from sklearn import preprocessing
from sklearn import model_selection
from sklearn.ensemble import AdaBoostClassifier


In [62]:
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split

gnb = GaussianNB()

tr_data, te_data, tr_labels, te_labels = train_test_split(
    train_users, labels_df, 
    test_size=0.33,
    random_state=20160302
)

gnb.fit(tr_data, tr_labels.values.ravel())


0,1,2
,priors,
,var_smoothing,1e-09


In [63]:
prob_arr = gnb.predict_proba(te_data)

In [73]:
from sklearn.metrics import ndcg_score
from sklearn.preprocessing import LabelBinarizer

# Convert labels into one-hot
lb = LabelBinarizer()
lb.fit(range(predictions.shape[1]))  # make sure it matches number of classes
ground_truth_onehot = lb.transform(ground_truth)

# Now compute NDCG
score = ndcg_score(ground_truth_onehot, predictions, k=5)
print("NDCG@5:", score)


NDCG@5: 1.0


In [71]:
# Convert ground truth to numpy
ground_truth = te_labels.to_numpy()

# Predictions from your classifier
predictions = prob_arr   # shape (n_samples, n_classes)

# Compute NDCG@5
score = ndcg_score(ground_truth, predictions, k=5)
print("NDCG@5:", score)


NDCG@5: 1.025002179071937


In [76]:
print ("NDCG Score for Naive Bayes:")
print (score)

NDCG Score for Naive Bayes:
1.0


In [78]:
print ("Accuracy Score for Naive Bayes:")
print (gnb.score(te_data,te_labels))

Accuracy Score for Naive Bayes:
1.0


# Part 2

In [80]:
from sklearn.model_selection import KFold
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import ndcg_score
from sklearn.preprocessing import LabelBinarizer

foldnum = 0
fold_results = pd.DataFrame()

kf = KFold(n_splits=10, shuffle=True, random_state=20160302)

for train_idx, test_idx in kf.split(train_users):
    foldnum += 1
    
    tr_data, te_data, tr_labels, te_labels = folds_to_split(
        train_users, labels_df, train_idx, test_idx
    )
    
    gnb1 = GaussianNB()
    gnb1.fit(tr_data, tr_labels.values.ravel())
    prob_arr_gnb1 = gnb1.predict_proba(te_data)
    
    # convert labels to one-hot for ndcg_score
    lb = LabelBinarizer()
    lb.fit(range(prob_arr_gnb1.shape[1]))
    ground_truth = lb.transform(te_labels.to_numpy())
    
    score_gnb1 = ndcg_score(ground_truth, prob_arr_gnb1, k=5)
    fold_results.loc[foldnum, 'Ndcg_Gnb'] = score_gnb1

print(fold_results.mean())


Ndcg_Gnb    1.0
dtype: float64


# Part 3
## Using scaled data for analysis

In [82]:
from sklearn import preprocessing
train_users_scaled = pd.DataFrame(preprocessing.StandardScaler().fit_transform(train_users))
print (train_users_scaled.head(n=5))

         0         1         2         3         4         5         6   \
0 -4.380020 -0.927275 -0.163283 -1.596552 -0.427798 -0.141579 -0.582242   
1 -4.357961  1.058038  0.287705 -1.596552 -0.427798 -0.141579  2.556797   
2 -4.348661 -0.927275  2.317149  0.628333 -0.035009 -0.141579 -0.582242   
3 -4.303076 -0.927275  0.738692 -1.596552 -0.427798 -0.141579 -0.582242   
4 -4.283949 -0.927275  0.625945  0.628333 -0.427798 -0.141579 -0.582242   

         7         8         9         10        11        12        13  \
0 -0.468760 -0.798954 -0.359375 -0.876174 -0.971889 -3.222044 -0.006939   
1  0.251719 -0.798954 -0.359375 -0.876174 -0.971889 -2.156499 -0.315897   
2 -0.468760 -0.798954 -0.359375  0.324910  1.091035 -3.222044  0.919936   
3 -0.468760 -0.798954 -0.359375 -0.876174  0.059573 -2.156499  1.846811   
4 -0.468760 -0.798954 -0.359375 -0.876174 -0.971889 -3.222044  0.919936   

         14        15        16  
0  1.387946 -0.345061 -0.521746  
1  1.044700 -0.345061 -0.52174

In [86]:
import pandas as pd
import numpy as np
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import KFold
from sklearn.preprocessing import label_binarize
from sklearn.metrics import ndcg_score, accuracy_score

foldnum = 0
fold_results = pd.DataFrame()

kf = KFold(n_splits=10, shuffle=True, random_state=20160302)

# All possible classes (needed for one-hot binarization)
classes = np.unique(labels_df)

for train, test in kf.split(train_users_scaled):
    foldnum += 1
    
    # Split train/test using your custom function
    tr_data, te_data, tr_labels, te_labels = folds_to_split(
        train_users_scaled, labels_df, train, test
    )
    
    # Train Naive Bayes
    gnb2 = GaussianNB()
    gnb2.fit(tr_data, tr_labels.values.ravel())
    
    # Predicted probabilities
    prob_arr_gnb2 = gnb2.predict_proba(te_data)
    
    # Convert ground-truth labels into one-hot (needed for ndcg_score)
    y_true_bin = label_binarize(te_labels, classes=classes)
    
    # Compute metrics
    score_ndcg = ndcg_score(y_true_bin, prob_arr_gnb2, k=5)
    score_acc = accuracy_score(te_labels, gnb2.predict(te_data))
    
    # Store results
    fold_results.loc[foldnum, 'Ndcg_Gnb'] = score_ndcg
    fold_results.loc[foldnum, 'Accuracy_Gnb'] = score_acc

# Print average scores across folds
print(fold_results.mean())


Ndcg_Gnb        1.0
Accuracy_Gnb    1.0
dtype: float64


In [91]:
# !pip freeze > requirements.txt
!{sys.executable} -m pip freeze > requirements.txt


In [89]:
!where python


C:\Users\ralma\OneDrive\Desktop\Pisa\DATA MINING\Airbnb\venv\Scripts\python.exe
C:\Users\ralma\AppData\Local\Programs\Python\Python311\python.exe
C:\Users\ralma\AppData\Local\Programs\Python\Python313\python.exe
C:\Users\ralma\AppData\Local\Microsoft\WindowsApps\python.exe


In [90]:
import sys
print(sys.executable)


C:\Users\ralma\AppData\Local\Programs\Python\Python313\python.exe
