In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn import preprocessing

In [2]:
# Read the raw train/test splits from the sibling `initial` directory.
train = pd.read_csv('../initial/Train.csv')
test = pd.read_csv('../initial/Test.csv')

# Stack both splits (train rows first, then test rows), each keyed by user_id,
# so features can also be engineered over the combined population.
merged = pd.concat([split.set_index('user_id') for split in (train, test)])

In [3]:
def _ratio_or_zero(numerator, denominator):
    """Divide two Series element-wise, mapping NaN and +/-inf results to 0."""
    ratio = numerator / denominator
    # x/0 yields +/-inf and 0/0 (or a missing operand) yields NaN; both become 0.
    return ratio.replace([np.inf, -np.inf], np.nan).fillna(0)


def _deviation_from_group(frame, keys, column, statistic):
    """Return ``frame[column]`` minus its per-group ``statistic`` ('median', 'mean', ...)."""
    # transform() keeps the original row index, unlike groupby().apply(), which
    # prepends the group keys to the index on pandas >= 2.0.
    return frame[column] - frame.groupby(keys)[column].transform(statistic)


def data_preparation (train_y):
    """Build the engineered feature table from a raw train/test frame.

    The input frame is NOT modified; a new DataFrame is returned, so the
    function can be called repeatedly on the same frame.

    Steps, in order:
      * drop ``ARPU_SEGMENT`` and ``MRG``; index by ``user_id`` if it is a column;
      * set ``CHURN`` aside (when present) and re-append it as the last column;
      * add per-row NaN counts: ``TELE_NANS_SUM`` (DATA_VOLUME and the five call
        columns) and ``NONTELE_NANS_SUM`` (every other column);
      * add ``MONTANT_TO_REVENUE``, ``FULL_CALLS_SUM``,
        ``FULL_CALLS_SUM_TO_REGULATIRY`` and ``DATA_VOLUME_TO_REGULARITY``;
        undefined ratios (NaN or division by zero) are stored as 0;
      * encode ``TENURE`` as the rank of its value in sorted order and
        ``REGION`` (NaN -> 'UNKNOWN') as its rank by descending frequency;
      * add REGULARITY deviations from the per-TENURE and per-(REGION, TENURE)
        median and mean;
      * fill the remaining NaNs with 0, recoding observed zeros in the
        telephony columns to 1 first;
      * replace ``TOP_PACK`` with ``POPULARITY``, its relative frequency.

    NOTE: the TENURE/REGION codes, group statistics and POPULARITY are derived
    from the frame passed in, so separate calls on train and test can produce
    different encodings for the same raw value.

    Parameters
    ----------
    train_y : pandas.DataFrame
        Raw frame containing the columns referenced above. ``user_id`` may be
        a column or already the index; ``CHURN`` is optional.

    Returns
    -------
    pandas.DataFrame
        New frame indexed like the input (by ``user_id`` when it was a
        column), with ``CHURN`` last when it was present.

    Raises
    ------
    KeyError
        If a required column (e.g. ``ARPU_SEGMENT`` or ``MRG``) is missing.
    """
    call_cols = ['ON_NET', 'ORANGE', 'TIGO', 'ZONE1', 'ZONE2']
    tele_cols = ['DATA_VOLUME'] + call_cols

    # drop() returns a new frame, so everything below works on a copy.
    train_y = train_y.drop(columns=['ARPU_SEGMENT', 'MRG'])
    if 'user_id' in train_y.columns:
        train_y = train_y.set_index('user_id')

    target = train_y.pop('CHURN') if 'CHURN' in train_y.columns else None

    # Feature generating

    # Missing-value counts, taken before any NaN is filled.
    train_y['TELE_NANS_SUM'] = train_y[tele_cols].isnull().sum(axis=1)
    train_y['NONTELE_NANS_SUM'] = train_y[train_y.columns.difference(tele_cols)].isnull().sum(axis=1)

    train_y['MONTANT_TO_REVENUE'] = _ratio_or_zero(train_y['MONTANT'], train_y['REVENUE'])

    # sum(axis=1) skips NaN, so a row with no call data sums to 0.
    calls_sum = train_y[call_cols].sum(axis=1)
    train_y['FULL_CALLS_SUM'] = calls_sum

    # `/` is true division, so no astype is needed here.
    # The misspelled column name is kept so downstream consumers keep working.
    train_y['FULL_CALLS_SUM_TO_REGULATIRY'] = _ratio_or_zero(calls_sum, train_y['REGULARITY'])

    # Computed in single precision, as before.
    train_y['DATA_VOLUME_TO_REGULARITY'] = _ratio_or_zero(
        train_y['DATA_VOLUME'].astype(np.single),
        train_y['REGULARITY'].astype(np.single),
    )

    # Categorical TENURE: rank of each label in sorted order. Missing values
    # are excluded from the mapping and stay NaN.
    tenure_codes = {label: code for code, label in enumerate(sorted(train_y['TENURE'].dropna().unique()))}
    train_y['TENURE'] = train_y['TENURE'].map(tenure_codes)

    # REGULARITY relative to the typical value of the same TENURE group.
    ten_median = _deviation_from_group(train_y, ['TENURE'], 'REGULARITY', 'median')
    ten_mean = _deviation_from_group(train_y, ['TENURE'], 'REGULARITY', 'mean')
    train_y['TEN_REGULARITY_MEDIAN'] = ten_median
    train_y['TEN_REGULARITY_MEAN'] = ten_mean

    # Categorical REGION: 0 is the most frequent region, 1 the next, and so on.
    region = train_y['REGION'].fillna('UNKNOWN')
    region_codes = {label: code for code, label in enumerate(region.value_counts().index)}
    train_y['REGION'] = region.map(region_codes)

    # REGULARITY relative to the typical value of the same (REGION, TENURE) group.
    reg_ten_median = _deviation_from_group(train_y, ['REGION', 'TENURE'], 'REGULARITY', 'median')
    reg_ten_mean = _deviation_from_group(train_y, ['REGION', 'TENURE'], 'REGULARITY', 'mean')
    train_y['REG_TEN_REGULARITY_MEDIAN'] = reg_ten_median
    train_y['REG_TEN_REGULARITY_MEAN'] = reg_ten_mean

    train_y['TOP_PACK'] = train_y['TOP_PACK'].fillna('Unknown')

    for col in ['FREQ_TOP_PACK', 'MONTANT', 'FREQUENCE_RECH', 'REVENUE', 'FREQUENCE']:
        train_y[col] = train_y[col].fillna(0)

    # Observed zeros are recoded to 1 so that 0 can stand for "missing".
    # NOTE(review): this makes a true 0 indistinguishable from a true 1;
    # kept as in the original, confirm it is intended.
    for col in call_cols + ['DATA_VOLUME']:
        train_y[col] = train_y[col].replace(0, 1).fillna(0)

    # Popularity: share of rows using each TOP_PACK (NaNs were filled above,
    # so the shares are taken over all rows).
    train_y['POPULARITY'] = train_y['TOP_PACK'].map(train_y['TOP_PACK'].value_counts(normalize=True))
    train_y = train_y.drop(columns=['TOP_PACK'])

    # Move target to last position
    if target is not None:
        train_y['CHURN'] = target

    return train_y

In [4]:
# Engineer features for the training split and persist them; the frame's
# index is written out as the first CSV column.
Preproc_train = data_preparation(train)
Preproc_train.to_csv(path_or_buf='dataset_0_train.csv')

In [6]:
# Same pipeline applied to the test split on its own, then saved to disk.
Preproc_test = data_preparation(test)
Preproc_test.to_csv(path_or_buf='dataset_0_test.csv')

In [8]:
# Engineer features over train and test stacked together, so encodings and
# group statistics are computed on the combined population.
train_merged = data_preparation(merged)

# `merged` is the train rows followed by the test rows, so split by position.
# Filtering the test part with `~index.isin(train index)` would silently drop
# any test row whose user_id also occurs in train.
dirty_train = train_merged.iloc[:train.shape[0]]
dirty_test = train_merged.iloc[train.shape[0]:]

dirty_train.to_csv('dataset_0_train_merged.csv')
dirty_test.to_csv('dataset_0_test_merged.csv')