In [5]:
import warnings

import numpy as np
import pandas as pd

# Fix the NumPy global seed so random steps are repeatable between runs
seed = 202
np.random.seed(seed)

# Silence every warning raised from here on
warnings.simplefilter('ignore')

In [6]:
# Huge thanks to the author of the kernel linked below!
# I think it is the best kernel I have read in the past year.
# https://www.kaggle.com/sudalairajkumar/simple-exploration-baseline-ga-customer-revenue

### Import dataframes

In [7]:
# Load the train and test frames (JSON features already flattened),
# keeping the date and identifier columns as strings
train_raw_df, test_raw_df = (
    pd.read_csv(
        csv_path,
        dtype=dict(date=str, fullVisitorId=str, sessionId=str),
        nrows=None,
    )
    for csv_path in ("toDF_train(1).csv", "toDF_test(1).csv")
)
train_raw_df.shape, test_raw_df.shape

((903653, 55), (804684, 53))

### Target variable exploration 

In [8]:
import matplotlib.pyplot as plt

# The revenue column has to be float before it can be aggregated
train_raw_df["totals.transactionRevenue"] = (
    train_raw_df["totals.transactionRevenue"].astype('float')
)

# The target is the natural log of each user's summed transactions,
# so total the transaction revenue per visitor first
gdf = (
    train_raw_df
    .groupby("fullVisitorId")["totals.transactionRevenue"]
    .sum()
    .reset_index()
)

# Sort the log1p-transformed totals and scatter them against their rank
sorted_log_revenue = np.sort(np.log1p(gdf["totals.transactionRevenue"].values))
plt.figure(figsize=(8, 6))
plt.scatter(range(len(gdf)), sorted_log_revenue)
plt.xlabel('index', fontsize=12)
plt.ylabel('TransactionRevenue', fontsize=12)
plt.show()

<Figure size 800x600 with 1 Axes>

### Number of visitors and common visitors

In [9]:
# Distinct visitor ids in each split, next to the number of rows
print("Number of unique visitors in train set : ", train_raw_df["fullVisitorId"].nunique(), " out of rows : ", len(train_raw_df))
print("Number of unique visitors in test set : ", test_raw_df["fullVisitorId"].nunique(), " out of rows : ", len(test_raw_df))

# Visitor ids that occur in both splits
print("Number of common visitors in train and test set : ", len(set(train_raw_df["fullVisitorId"].unique()) & set(test_raw_df["fullVisitorId"].unique())))

Number of unique visitors in train set :  714167  out of rows :  903653
Number of unique visitors in test set :  617242  out of rows :  804684
Number of common visitors in train and test set :  7679


### Columns with constant values

In [10]:
# Keep the names of train columns holding a single distinct value (NaN counts as a value)
const_cols = list(
    filter(
        lambda name: train_raw_df[name].nunique(dropna=False) == 1,
        train_raw_df.columns,
    )
)
const_cols

['socialEngagementType',
 'device.browserSize',
 'device.browserVersion',
 'device.flashVersion',
 'device.language',
 'device.mobileDeviceBranding',
 'device.mobileDeviceInfo',
 'device.mobileDeviceMarketingName',
 'device.mobileDeviceModel',
 'device.mobileInputSelector',
 'device.operatingSystemVersion',
 'device.screenColors',
 'device.screenResolution',
 'geoNetwork.cityId',
 'geoNetwork.latitude',
 'geoNetwork.longitude',
 'geoNetwork.networkLocation',
 'totals.visits',
 'trafficSource.adwordsClickInfo.criteriaParameters']

## Checkpoint

### Drop constant columns

In [16]:
def drop_constant_columns(df, features):
    """Return a new frame equal to ``df`` without the columns in ``features``.

    The input frame is not modified. The previous version dropped the columns
    in place, which altered the caller's raw frame and made the cell fail
    with ``KeyError`` when it was executed a second time.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to remove columns from.
    features : list
        Labels of the columns to remove.

    Returns
    -------
    pandas.DataFrame
        A new frame without the listed columns.

    Raises
    ------
    KeyError
        If a label in ``features`` is not a column of ``df``.
    """
    # Without inplace=True, drop() builds and returns a new frame
    return df.drop(features, axis=1)

# Remove the constant columns from train first, then test, and show the resulting shapes
train_df, test_df = (
    drop_constant_columns(raw_frame, const_cols)
    for raw_frame in (train_raw_df, test_raw_df)
)
train_df.shape, test_df.shape

((903653, 36), (804684, 34))

### Features differences between train and test

In [17]:
# Columns that are present in the train frame but absent from the test frame
print("Variables not in test but in train : ", set(train_df.columns) - set(test_df.columns))

Variables not in test but in train :  {'trafficSource.campaignCode', 'totals.transactionRevenue'}


### Drop trafficSource.campaignCode from train_df, and sessionId from both train_df and test_df

In [18]:
# campaignCode is dropped from train only; sessionId is dropped from both frames
train_df.drop(columns=['trafficSource.campaignCode', 'sessionId'], inplace=True)
test_df.drop(columns=['sessionId'], inplace=True)

### Conversion from object to string and from float to integer

In [None]:
# Convert the listed columns of a frame to integers while keeping missing values
def float_to_int(df, int_col_vec=None):
    """Cast float columns of ``df`` to the nullable ``Int64`` dtype, in place.

    The previous version ignored ``df`` and always edited the global
    ``train_df``. It also tried to restore missing values with
    ``replace('-1', np.nan)``, which compares a string against integers and
    therefore left the ``-1`` placeholder in the data. The nullable ``Int64``
    dtype stores integers and missing values together, so no placeholder is
    needed.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are converted. It is modified in place.
    int_col_vec : list of str, optional
        Columns to convert. Defaults to the bounce, new-visit, page-view and
        AdWords page columns.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, returned for convenience.

    Raises
    ------
    KeyError
        If a requested column is missing from ``df``.
    ValueError or TypeError
        If a column holds values that are not whole numbers.
    """
    if int_col_vec is None:
        # Default list of columns to be converted to integer (no duplicates)
        int_col_vec = [
            'totals.bounces',
            'totals.newVisits',
            'totals.pageviews',
            'trafficSource.adwordsClickInfo.page',
        ]

    for el in int_col_vec:
        # to_numeric also accepts numeric strings; Int64 keeps NaN as <NA>
        df[el] = pd.to_numeric(df[el]).astype('Int64')

    return df

In [None]:
def obj_to_str(df, str_col_vec=None):
    """Convert the values of the chosen columns of ``df`` to ``str``, in place.

    The original cell stopped at ``str_col_vec =`` and could not be executed.
    This completes it with a minimal implementation; the intended column list
    was never written down, so the default below is an assumption to confirm.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are converted. It is modified in place.
    str_col_vec : list of str, optional
        Columns to convert. Defaults to every object-dtype column of ``df``.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, returned for convenience.

    Raises
    ------
    KeyError
        If a requested column is missing from ``df``.
    """
    # list of columns to be converted to strings
    if str_col_vec is None:
        str_col_vec = df.select_dtypes(include='object').columns.tolist()

    for col in str_col_vec:
        values = df[col]
        # Keep missing entries missing instead of turning them into 'nan'/'None'
        df[col] = values.where(values.isna(), values.astype(str))

    return df

In [1]:
# Show the first 30 values of the AdWords page column
train_df['trafficSource.adwordsClickInfo.page'].iloc[:30]

NameError: name 'train_df' is not defined

In [20]:
# Print the dtype and non-null count of every column in the train frame
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 903653 entries, 0 to 903652
Data columns (total 35 columns):
channelGrouping                                 903653 non-null object
date                                            903653 non-null object
fullVisitorId                                   903653 non-null object
sessionId                                       903653 non-null object
visitId                                         903653 non-null int64
visitNumber                                     903653 non-null int64
visitStartTime                                  903653 non-null int64
device.browser                                  903653 non-null object
device.deviceCategory                           903653 non-null object
device.isMobile                                 903653 non-null bool
device.operatingSystem                          903653 non-null object
geoNetwork.city                                 903653 non-null object
geoNetwork.continent                      

In [21]:
# Display the first rows of the test frame
test_df.head()

Unnamed: 0,channelGrouping,date,fullVisitorId,sessionId,visitId,visitNumber,visitStartTime,device.browser,device.deviceCategory,device.isMobile,...,trafficSource.adwordsClickInfo.gclId,trafficSource.adwordsClickInfo.isVideoAd,trafficSource.adwordsClickInfo.page,trafficSource.adwordsClickInfo.slot,trafficSource.campaign,trafficSource.isTrueDirect,trafficSource.keyword,trafficSource.medium,trafficSource.referralPath,trafficSource.source
0,Organic Search,20171016,6167871330617112363,6167871330617112363_1508151024,1508151024,2,1508151024,Chrome,desktop,False,...,,,,,(not set),True,(not provided),organic,,google
1,Organic Search,20171016,643697640977915618,0643697640977915618_1508175522,1508175522,1,1508175522,Chrome,desktop,False,...,,,,,(not set),,(not provided),organic,,google
2,Organic Search,20171016,6059383810968229466,6059383810968229466_1508143220,1508143220,1,1508143220,Chrome,desktop,False,...,,,,,(not set),,(not provided),organic,,google
3,Organic Search,20171016,2376720078563423631,2376720078563423631_1508193530,1508193530,1,1508193530,Safari,mobile,True,...,,,,,(not set),,(not provided),organic,,google
4,Organic Search,20171016,2314544520795440038,2314544520795440038_1508217442,1508217442,1,1508217442,Safari,desktop,False,...,,,,,(not set),,(not provided),organic,,google


In [22]:
# Print the dtype and non-null count of every column in the test frame
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 804684 entries, 0 to 804683
Data columns (total 34 columns):
channelGrouping                                 804684 non-null object
date                                            804684 non-null object
fullVisitorId                                   804684 non-null object
sessionId                                       804684 non-null object
visitId                                         804684 non-null int64
visitNumber                                     804684 non-null int64
visitStartTime                                  804684 non-null int64
device.browser                                  804684 non-null object
device.deviceCategory                           804684 non-null object
device.isMobile                                 804684 non-null bool
device.operatingSystem                          804684 non-null object
geoNetwork.city                                 804684 non-null object
geoNetwork.continent                      

### Export Dataframes

In [23]:
# Write both cleaned frames to CSV without the index column
train_df.to_csv(path_or_buf='cleaned_train(2).csv', index=False)
test_df.to_csv(path_or_buf='cleaned_test(2).csv', index=False)