In [163]:
import pandas as pd
import numpy as np
from collections import Counter
import re
from sklearn.feature_extraction.text import CountVectorizer
import nltk
from nltk.corpus import wordnet
from sklearn.model_selection import train_test_split,GridSearchCV
from math import sqrt
from sklearn.metrics import mean_squared_error
import lightgbm as lgb
import pickle
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import KFold
from sklearn.exceptions import NotFittedError

In [164]:
train = pd.read_excel('Data_Train.xlsx')
test = pd.read_excel('Data_Test.xlsx')

tg=train[['COST']]
tg["COST1"] = np.log1p(tg["COST"])
target=tg.COST1
del train['COST']
del train['RESTAURANT_ID']
del test['RESTAURANT_ID']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """


In [165]:
all_data=pd.concat([train,test])
all_data = all_data.reset_index(drop=True)

In [166]:
all_data['TITLE']=all_data['TITLE'].str.strip()
all_data['TITLE']=all_data['TITLE'].str.upper()

all_data['CUISINES']=all_data['CUISINES'].str.strip()
all_data['CUISINES']=all_data['CUISINES'].str.upper()

all_data['CITY']=all_data['CITY'].str.strip()
all_data['CITY']=all_data['CITY'].str.upper()

all_data['LOCALITY']=all_data['LOCALITY'].str.strip()
all_data['LOCALITY']=all_data['LOCALITY'].str.upper()

all_data['TIME']=all_data['TIME'].str.strip()
all_data['TIME']=all_data['TIME'].str.upper()

In [167]:
#Cleaning Rating

rates = list(all_data['RATING'])

for i in range(len(rates)) :
    try:
       rates[i] = float(rates[i])
    except :
       rates[i] = np.nan


# Votes
       
votes = list(all_data['VOTES'])

for i in range(len(votes)) :
    try:
       votes[i] = int(votes[i].split(" ")[0].strip())
    except :
       pass     

num_data = {}

num_data['RATING'] = rates
num_data['VOTES'] = votes

num_data = pd.DataFrame(num_data)

In [168]:
del all_data['RATING']
del all_data['VOTES']
feature_data=pd.concat([all_data,num_data], axis=1)

In [169]:
cat_cols=feature_data.columns[feature_data.dtypes=='object'].tolist()
print (cat_cols)

num_cols=feature_data.columns[feature_data.dtypes!='object'].tolist()
print(num_cols)

['TITLE', 'CUISINES', 'TIME', 'CITY', 'LOCALITY']
['RATING', 'VOTES']


In [170]:
cat_num_feats=pd.DataFrame(np.column_stack([feature_data[m[0]].map(dict(feature_data.groupby(m[0])[m[1]].mean()))
 for m in [(a,b) for a in cat_cols for b in num_cols]]),
columns=['cat_num_feat'+str(i) for i in range(len(cat_cols)*len(num_cols))])

In [171]:
feature_data.reset_index(drop=True,inplace=True)
feature_data=pd.concat((feature_data,cat_num_feats),axis=1)

In [172]:
class Encoding(BaseEstimator):
    categorical_columns = None
    return_df = False
    random_state = 30
    threshold = 50

    def __init__(self):
        pass

    def convert_input(self, X):
        if not isinstance(X, pd.DataFrame):
            if isinstance(X, list):
                X = pd.DataFrame(np.array(X))
            elif isinstance(X, (np.generic, np.ndarray, pd.Series)):
                X = pd.DataFrame(X)
            else:
                raise ValueError('Unexpected input type: %s' % (str(type(X))))
            X = X.apply(lambda x: pd.to_numeric(x, errors='ignore'))
        x = X.copy(deep = True)
        return x

    def get_categorical_columns(self, X):
        return X.select_dtypes(include=['object', 'category']).columns.tolist()

    def get_numerical_columns(self,X):
        temp_x=X[X.columns[X.nunique()<=self.threshold]]
        col_names=temp_x.columns[temp_x.dtypes!='object']
        return col_names

    def apply_encoding(self, X_in, encoding_dict):
        X = self.convert_input(X_in)
        for col in self.categorical_columns:
            if col in encoding_dict:
                freq_dict = encoding_dict[col]
                X[col] = X[col].apply(lambda x: freq_dict[x] if x  in freq_dict else np.nan)
        return X

    def create_encoding_dict(self, X, y):
        return {}

    def fit(self, X, y=None):
        if X is None:
            raise ValueError("Input array is required to call fit method!")
        X = self.convert_input(X)
        self.encoding_dict = self.create_encoding_dict(X, y)
        return self

    def transform(self, X):
        df = self.apply_encoding(X, self.encoding_dict)
        if self.return_df:
            return df
        else:
            return df.values

    def fit_transform(self, X, y=None):
        self.fit(X)
        return self.transform(X)

    def inverse_transform(self, X):
        X = self.convert_input(X)
        for col in self.categorical_columns:
            freq_dict = self.encoding_dict[col]
            for key, val in freq_dict.iteritems():
                X.loc[X[col] == val, col] = key
        if self.return_df:
            return X
        else:
            return X.values

In [173]:
class FreqeuncyEncoding(Encoding):
    '''
    class to perform FreqeuncyEncoding on Categorical Variables
    Initialization Variabes:
    categorical_columns: list of categorical columns from the dataframe
    or list of indexes of caategorical columns for numpy ndarray
    return_df: boolean
        if True: returns pandas dataframe on transformation
        else: return numpy ndarray
    '''
    def __init__(self, categorical_columns = None, return_df = False):
        self.categorical_columns = categorical_columns
        self.return_df = return_df

    def create_encoding_dict(self, X, y):
        encoding_dict = {}
        if self.categorical_columns is None:
            self.categorical_columns = self.get_categorical_columns(X)
        for col in self.categorical_columns:
            encoding_dict.update({col: X[col].value_counts(normalize = True).to_dict()})
        return encoding_dict


fe=FreqeuncyEncoding(categorical_columns=cat_cols,return_df=True)
feature_data1=fe.fit_transform(feature_data)

In [174]:
titles = list(all_data['TITLE'])

# Finding Maximum number of titles mentioned in a single cell
maxim = 1
for i in titles :
    if len(i.split(',')) > maxim:
         maxim = len(i.split(','))
         
print("\n\nMaximum Titles in a Cell : ", maxim)    

all_titles = []

for i in titles :
    if len(i.split(',')) == 1:
         all_titles.append(i.split(',')[0].strip().upper())
    else :
        for it in range(len(i.split(','))):
            all_titles.append(i.split(',')[it].strip().upper())

print("\n\nNumber of Unique Titles : ", len(pd.Series(all_titles).unique()))
print("\n\nUnique Titles:\n", pd.Series(all_titles).unique())

all_titles = list(pd.Series(all_titles).unique())


for i in range(25):
    ttl=all_titles[i]
    ttl1=ttl+'1'
    all_data[ttl1] = all_data['TITLE'].str.contains(ttl)
    all_data[ttl1] = all_data[ttl1].map({True: 1, False: 0})
    
    
#del all_data['TITLE']



Maximum Titles in a Cell :  2


Number of Unique Titles :  25


Unique Titles:
 ['CASUAL DINING' 'BAR' 'QUICK BITES' 'DESSERT PARLOR' 'CAFE'
 'MICROBREWERY' 'BEVERAGE SHOP' 'IRANI CAFE' 'BAKERY' 'NONE' 'PUB'
 'FINE DINING' 'SWEET SHOP' 'LOUNGE' 'FOOD COURT' 'FOOD TRUCK' 'MESS'
 'KIOSK' 'CLUB' 'CONFECTIONERY' 'DHABA' 'MEAT SHOP' 'COCKTAIL BAR'
 'PAAN SHOP' 'BHOJANALYA']


In [175]:
# Analysing cuisines 

cuisines = list(all_data['CUISINES'])

maxim = 1
for i in cuisines :
    if len(i.split(',')) > maxim:
         maxim = len(i.split(','))
         
print("\n\nMaximum cuisines in a Cell : ", maxim)    

all_cuisines = []

for i in cuisines :
    if len(i.split(',')) == 1:
         #print(i.split(',')[0])
         all_cuisines.append(i.split(',')[0].strip().upper())
    else :
        for it in range(len(i.split(','))):
            #print(i.split(',')[it])
            all_cuisines.append(i.split(',')[it].strip().upper())

print("\n\nNumber of Unique Cuisines : ", len(pd.Series(all_cuisines).unique()))
print("\n\nUnique Cuisines:\n", pd.Series(all_cuisines).unique())

all_cuisines = list(pd.Series(all_cuisines).unique())

for i in range(130):
    ttl=all_cuisines[i]
    ttl2=ttl+'2'
    all_data[ttl2] = all_data['CUISINES'].str.contains(ttl)
    all_data[ttl2] = all_data[ttl2].map({True: 1, False: 0})
    
    
#del all_data['CUISINES']



Maximum cuisines in a Cell :  8


Number of Unique Cuisines :  130


Unique Cuisines:
 ['MALWANI' 'GOAN' 'NORTH INDIAN' 'ASIAN' 'MODERN INDIAN' 'JAPANESE'
 'CHINESE' 'BIRYANI' 'HYDERABADI' 'TIBETAN' 'DESSERTS' 'SEAFOOD' 'CAFE'
 'PIZZA' 'BURGER' 'BAR FOOD' 'SOUTH INDIAN' 'FAST FOOD' 'BEVERAGES'
 'ARABIAN' 'MUGHLAI' 'MAHARASHTRIAN' 'PARSI' 'THAI' 'BAKERY' 'MOMOS'
 'CONTINENTAL' 'EUROPEAN' 'ROLLS' 'ANDHRA' 'ITALIAN' 'BBQ' 'FINGER FOOD'
 'TEA' 'AMERICAN' 'HEALTHY FOOD' 'COFFEE' 'INDONESIAN' 'KOREAN' 'NEPALESE'
 'ICE CREAM' 'MEXICAN' 'KERALA' 'INDIAN' 'MITHAI' 'STREET FOOD'
 'MALAYSIAN' 'VIETNAMESE' 'IRANIAN' 'KEBAB' 'JUICES' 'SANDWICH'
 'MEDITERRANEAN' 'SALAD' 'GUJARATI' 'RAJASTHANI' 'TEX-MEX' 'ROAST CHICKEN'
 'BURMESE' 'CHETTINAD' 'NORTH EASTERN' 'LEBANESE' 'COFFEE AND TEA' 'GRILL'
 '' 'BIHARI' 'BENGALI' 'LUCKNOWI' 'AWADHI' 'STEAK' 'FRENCH' 'PORTUGUESE'
 'WRAPS' 'SRI LANKAN' 'ORIYA' 'ETHIOPIAN' 'KONKAN' 'SUSHI' 'SPANISH'
 'RUSSIAN' 'MANGALOREAN' 'TURKISH' 'BUBBLE TEA' 'AFGHAN' 'NAGA'
 '

In [176]:
all_data=all_data.loc[:, (all_data != 0).any(axis=0)]
print(all_data.shape)

(16921, 160)


In [177]:
all_data['MON-TUE']=np.where(all_data['TIME'].str.contains('(MON-TUE)'),1,0)
all_data['MON-WED']=np.where(all_data['TIME'].str.contains('(MON-WED)'),1,0)
all_data['MON-THU']=np.where(all_data['TIME'].str.contains('(MON-THU)'),1,0)
all_data['MON-FRI']=np.where(all_data['TIME'].str.contains('(MON-FRI)'),1,0)
all_data['MON-SAT']=np.where(all_data['TIME'].str.contains('(MON-SAT)'),1,0)
all_data['MON-SUN']=np.where(all_data['TIME'].str.contains('(MON-SUN)'),1,0)
all_data['TUE-WED']=np.where(all_data['TIME'].str.contains('(TUE-WED)'),1,0)
all_data['TUE-THU']=np.where(all_data['TIME'].str.contains('(TUE-THU)'),1,0)
all_data['TUE-FRI']=np.where(all_data['TIME'].str.contains('(TUE-FRI)'),1,0)
all_data['TUE-SAT']=np.where(all_data['TIME'].str.contains('(TUE-SAT)'),1,0)
all_data['TUE-SUN']=np.where(all_data['TIME'].str.contains('(TUE-SUN)'),1,0)
all_data['WED-THU']=np.where(all_data['TIME'].str.contains('(WED-THU)'),1,0)
all_data['WED-FRI']=np.where(all_data['TIME'].str.contains('(WED-FRI)'),1,0)
all_data['WED-SAT']=np.where(all_data['TIME'].str.contains('(WED-SAT)'),1,0)
all_data['WED-SUN']=np.where(all_data['TIME'].str.contains('(WED-SUN)'),1,0)
all_data['THU-FRI']=np.where(all_data['TIME'].str.contains('(THU-FRI)'),1,0)
all_data['THU-SAT']=np.where(all_data['TIME'].str.contains('(THU-SAT)'),1,0)
all_data['THU-SUN']=np.where(all_data['TIME'].str.contains('(THU-SUN)'),1,0)
all_data['FRI-SAT']=np.where(all_data['TIME'].str.contains('(FRI-SAT)'),1,0)
all_data['FRI-SUN']=np.where(all_data['TIME'].str.contains('(FRI-SUN)'),1,0)
all_data['SAT-SUN']=np.where(all_data['TIME'].str.contains('(SAT-SUN)'),1,0)

all_data['MON']=np.where(all_data['TIME'].str.contains('MON'),1,0)
all_data['TUE']=np.where(all_data['TIME'].str.contains('TUE'),1,0)
all_data['WED']=np.where(all_data['TIME'].str.contains('WED'),1,0)
all_data['THU']=np.where(all_data['TIME'].str.contains('THU'),1,0)
all_data['FRI']=np.where(all_data['TIME'].str.contains('FRI'),1,0)
all_data['SAT']=np.where(all_data['TIME'].str.contains('SAT'),1,0)
all_data['SUN']=np.where(all_data['TIME'].str.contains('SUN'),1,0)

all_data['AM']=np.where(all_data['TIME'].str.contains('AM'),1,0)
all_data['PM']=np.where(all_data['TIME'].str.contains('PM'),1,0)
all_data['AM_cnt']=all_data['TIME'].str.count('AM')
all_data['PM_cnt']=all_data['TIME'].str.count('PM')
all_data['NOON']=np.where(all_data['TIME'].str.contains('NOON'),1,0)
all_data['MIDNIGHT']=np.where(all_data['TIME'].str.contains('MIDNIGHT'),1,0)
all_data['CLOSED']=np.where(all_data['TIME'].str.contains('CLOSED'),1,0)
all_data['HOURS']=np.where(all_data['TIME'].str.contains('HOURS'),1,0)
all_data['TIME_CNT']=all_data['TIME'].str.len()
all_data['comma_count'] = all_data['TIME'].str.count(',')
all_data['dash_count'] = all_data['TIME'].str.count('-')
all_data['collon_count'] = all_data['TIME'].str.count(':')
all_data['dotted_count'] = all_data['TIME'].str.count('...')
#all_data['TIME_no_count'] = all_data['TIME'].str.count(r'[0-9]')
#all_data['TIME_str_count'] = all_data['TIME'].str.count(r'[A-Z]')

all_data['1:15AM']=np.where(all_data['TIME'].str.contains('1:15AM'),1,0)
all_data['1:30AM']=np.where(all_data['TIME'].str.contains('1:30AM'),1,0)
all_data['1:45AM']=np.where(all_data['TIME'].str.contains('1:45AM'),1,0)
all_data['2AM']=np.where(all_data['TIME'].str.contains('2AM'),1,0)
all_data['2:15AM']=np.where(all_data['TIME'].str.contains('2:15AM'),1,0)
all_data['2:30AM']=np.where(all_data['TIME'].str.contains('2:30AM'),1,0)
all_data['2:45AM']=np.where(all_data['TIME'].str.contains('2:45AM'),1,0)
all_data['3AM']=np.where(all_data['TIME'].str.contains('3AM'),1,0)
all_data['3:15AM']=np.where(all_data['TIME'].str.contains('3:15AM'),1,0)
all_data['3:30AM']=np.where(all_data['TIME'].str.contains('3:30AM'),1,0)
all_data['3:45AM']=np.where(all_data['TIME'].str.contains('3:45AM'),1,0)
all_data['4AM']=np.where(all_data['TIME'].str.contains('4AM'),1,0)
all_data['4:15AM']=np.where(all_data['TIME'].str.contains('4:15AM'),1,0)
all_data['4:30AM']=np.where(all_data['TIME'].str.contains('4:30AM'),1,0)
all_data['4:45AM']=np.where(all_data['TIME'].str.contains('4:45AM'),1,0)
all_data['5AM']=np.where(all_data['TIME'].str.contains('5AM'),1,0)
all_data['5:15AM']=np.where(all_data['TIME'].str.contains('5:15AM'),1,0)
all_data['5:30AM']=np.where(all_data['TIME'].str.contains('5:30AM'),1,0)
all_data['5:45AM']=np.where(all_data['TIME'].str.contains('5:45AM'),1,0)
all_data['6AM']=np.where(all_data['TIME'].str.contains('6AM'),1,0)
all_data['6:15AM']=np.where(all_data['TIME'].str.contains('6:15AM'),1,0)
all_data['6:30AM']=np.where(all_data['TIME'].str.contains('6:30AM'),1,0)
all_data['6:45AM']=np.where(all_data['TIME'].str.contains('6:45AM'),1,0)
all_data['7AM']=np.where(all_data['TIME'].str.contains('7AM'),1,0)
all_data['7:15AM']=np.where(all_data['TIME'].str.contains('7:15AM'),1,0)
all_data['7:30AM']=np.where(all_data['TIME'].str.contains('7:30AM'),1,0)
all_data['7:45AM']=np.where(all_data['TIME'].str.contains('7:45AM'),1,0)
all_data['8AM']=np.where(all_data['TIME'].str.contains('8AM'),1,0)
all_data['8:15AM']=np.where(all_data['TIME'].str.contains('8:15AM'),1,0)
all_data['8:30AM']=np.where(all_data['TIME'].str.contains('8:30AM'),1,0)
all_data['8:45AM']=np.where(all_data['TIME'].str.contains('8:45AM'),1,0)
all_data['9AM']=np.where(all_data['TIME'].str.contains('9AM'),1,0)
all_data['9:15AM']=np.where(all_data['TIME'].str.contains('9:15AM'),1,0)
all_data['9:30AM']=np.where(all_data['TIME'].str.contains('9:30AM'),1,0)
all_data['9:45AM']=np.where(all_data['TIME'].str.contains('9:45AM'),1,0)
all_data['10AM']=np.where(all_data['TIME'].str.contains('10AM'),1,0)
all_data['10:15AM']=np.where(all_data['TIME'].str.contains('10:15AM'),1,0)
all_data['10:30AM']=np.where(all_data['TIME'].str.contains('10:30AM'),1,0)
all_data['10:45AM']=np.where(all_data['TIME'].str.contains('10:45AM'),1,0)
all_data['11AM']=np.where(all_data['TIME'].str.contains('11AM'),1,0)
all_data['11:15AM']=np.where(all_data['TIME'].str.contains('11:15AM'),1,0)
all_data['11:30AM']=np.where(all_data['TIME'].str.contains('11:30AM'),1,0)
all_data['11:45AM']=np.where(all_data['TIME'].str.contains('11:45AM'),1,0)
all_data['12AM']=np.where(all_data['TIME'].str.contains('12AM'),1,0)
all_data['12:15AM']=np.where(all_data['TIME'].str.contains('12:15AM'),1,0)
all_data['12:30AM']=np.where(all_data['TIME'].str.contains('12:30AM'),1,0)
all_data['12:45AM']=np.where(all_data['TIME'].str.contains('12:45AM'),1,0)
all_data['12NOON']=np.where(all_data['TIME'].str.contains('12NOON'),1,0)
        
                             
all_data['1:15PM']=np.where(all_data['TIME'].str.contains('1:15PM'),1,0)
all_data['1:30PM']=np.where(all_data['TIME'].str.contains('1:30PM'),1,0)
all_data['1:45PM']=np.where(all_data['TIME'].str.contains('1:45PM'),1,0)
all_data['2PM']=np.where(all_data['TIME'].str.contains('2PM'),1,0)
all_data['2:15PM']=np.where(all_data['TIME'].str.contains('2:15PM'),1,0)
all_data['2:30PM']=np.where(all_data['TIME'].str.contains('2:30PM'),1,0)
all_data['2:45PM']=np.where(all_data['TIME'].str.contains('2:45PM'),1,0)
all_data['3PM']=np.where(all_data['TIME'].str.contains('3PM'),1,0)
all_data['3:15PM']=np.where(all_data['TIME'].str.contains('3:15PM'),1,0)
all_data['3:30PM']=np.where(all_data['TIME'].str.contains('3:30PM'),1,0)
all_data['3:45PM']=np.where(all_data['TIME'].str.contains('3:45PM'),1,0)
all_data['4PM']=np.where(all_data['TIME'].str.contains('4PM'),1,0)
all_data['4:15PM']=np.where(all_data['TIME'].str.contains('4:15PM'),1,0)
all_data['4:30PM']=np.where(all_data['TIME'].str.contains('4:30PM'),1,0)
all_data['4:45PM']=np.where(all_data['TIME'].str.contains('4:45PM'),1,0)
all_data['5PM']=np.where(all_data['TIME'].str.contains('5PM'),1,0)
all_data['5:15PM']=np.where(all_data['TIME'].str.contains('5:15PM'),1,0)
all_data['5:30PM']=np.where(all_data['TIME'].str.contains('5:30PM'),1,0)
all_data['5:45PM']=np.where(all_data['TIME'].str.contains('5:45PM'),1,0)
all_data['6PM']=np.where(all_data['TIME'].str.contains('6PM'),1,0)
all_data['6:15PM']=np.where(all_data['TIME'].str.contains('6:15PM'),1,0)
all_data['6:30PM']=np.where(all_data['TIME'].str.contains('6:30PM'),1,0)
all_data['6:45PM']=np.where(all_data['TIME'].str.contains('6:45PM'),1,0)
all_data['7PM']=np.where(all_data['TIME'].str.contains('7PM'),1,0)
all_data['7:15PM']=np.where(all_data['TIME'].str.contains('7:15PM'),1,0)
all_data['7:30PM']=np.where(all_data['TIME'].str.contains('7:30PM'),1,0)
all_data['7:45PM']=np.where(all_data['TIME'].str.contains('7:45PM'),1,0)
all_data['8PM']=np.where(all_data['TIME'].str.contains('8PM'),1,0)
all_data['8:15PM']=np.where(all_data['TIME'].str.contains('8:15PM'),1,0)
all_data['8:30PM']=np.where(all_data['TIME'].str.contains('8:30PM'),1,0)
all_data['8:45PM']=np.where(all_data['TIME'].str.contains('8:45PM'),1,0)
all_data['9PM']=np.where(all_data['TIME'].str.contains('9PM'),1,0)
all_data['9:15PM']=np.where(all_data['TIME'].str.contains('9:15PM'),1,0)
all_data['9:30PM']=np.where(all_data['TIME'].str.contains('9:30PM'),1,0)
all_data['9:45PM']=np.where(all_data['TIME'].str.contains('9:45PM'),1,0)
all_data['10PM']=np.where(all_data['TIME'].str.contains('10PM'),1,0)
all_data['10:15PM']=np.where(all_data['TIME'].str.contains('10:15PM'),1,0)
all_data['10:30PM']=np.where(all_data['TIME'].str.contains('10:30PM'),1,0)
all_data['10:45PM']=np.where(all_data['TIME'].str.contains('10:45PM'),1,0)
all_data['11PM']=np.where(all_data['TIME'].str.contains('11PM'),1,0)
all_data['11:15PM']=np.where(all_data['TIME'].str.contains('11:15PM'),1,0)
all_data['11:30PM']=np.where(all_data['TIME'].str.contains('11:30PM'),1,0)
all_data['11:45PM']=np.where(all_data['TIME'].str.contains('11:45PM'),1,0)
all_data['12PM']=np.where(all_data['TIME'].str.contains('12PM'),1,0)
all_data['12:15PM']=np.where(all_data['TIME'].str.contains('12:15PM'),1,0)
all_data['12:30PM']=np.where(all_data['TIME'].str.contains('12:30PM'),1,0)
all_data['12:45PM']=np.where(all_data['TIME'].str.contains('12:45PM'),1,0)
all_data['12MIDNIGHT']=np.where(all_data['TIME'].str.contains('12MIDNIGHT'),1,0)

all_data['CITY_LEN']=all_data['CITY'].str.len()
all_data['LOCALITY_LEN']=all_data['LOCALITY'].str.len()
all_data['CITY_wrd'] = all_data.CITY.apply(lambda x: len(str(x).split(' ')))
all_data['LOCALITY_wrd'] = all_data.LOCALITY.apply(lambda x: len(str(x).split(' '))) 
all_data['CITY_no_count'] = all_data['CITY'].str.count(r'[0-9]')
all_data['LOCALITY_no_count'] = all_data['LOCALITY'].str.count(r'[0-9]')
all_data['CITY_str_count'] = all_data['CITY'].str.count(r'[A-Z]')
all_data['LOCALITY_str_count'] = all_data['LOCALITY'].str.count(r'[A-Z]')

all_data['TITLE_LEN']=all_data['TITLE'].str.len()
all_data['CUISINES_LEN']=all_data['CUISINES'].str.len()
all_data['TITLE_wrd1'] = all_data.TITLE.apply(lambda x: len(str(x).split(' ')))
all_data['CUISINES_wrd1'] = all_data.CUISINES.apply(lambda x: len(str(x).split(' ')))
all_data['TITLE_wrd2'] = all_data.TITLE.apply(lambda x: len(str(x).split(',')))
all_data['CUISINES_wrd2'] = all_data.CUISINES.apply(lambda x: len(str(x).split(',')))

del all_data['TITLE']
del all_data['CUISINES']

  """Entry point for launching an IPython kernel.
  
  This is separate from the ipykernel package so we can avoid doing imports until
  after removing the cwd from sys.path.
  """
  
  import sys
  
  if __name__ == '__main__':
  # Remove the CWD from sys.path while we load stuff.
  # This is added back by InteractiveShellApp.init_path()
  if sys.path[0] == '':
  del sys.path[0]
  
  from ipykernel import kernelapp as app
  app.launch_new_instance()


In [178]:
all_data=all_data.loc[:, (all_data != 0).any(axis=0)]
print(all_data.shape)

(16921, 293)


In [179]:
del all_data['TIME']
del all_data['CITY']
del all_data['LOCALITY']

In [180]:
all_data_new=pd.concat((feature_data1,all_data),axis=1)

In [181]:
print("\nContains NaN/Empty cells : ", all_data_new.isnull().values.any())
all_data_new.fillna(0, inplace = True)
print("\nContains NaN/Empty cells : ", all_data_new.isnull().values.any())
all_data_new=all_data_new.loc[:, (all_data_new != 0).any(axis=0)]
print(all_data_new.shape)


Contains NaN/Empty cells :  True

Contains NaN/Empty cells :  False
(16921, 307)


In [182]:
train1=all_data_new[:12690]
test1=all_data_new[12690:16921]

In [183]:
X_train, X_validation, y_train, y_validation = train_test_split(train1, target, train_size=0.7, random_state=1234)



In [184]:
feature_names = train1.columns.tolist()

# LightGBM dataset formatting 
lgtrain = lgb.Dataset(X_train, y_train,
                feature_name=feature_names)
lgvalid = lgb.Dataset(X_validation, y_validation,
                feature_name=feature_names)

In [185]:
lgtall = lgb.Dataset(train1, target,
                feature_name=feature_names)

In [186]:
params = {
    'objective' : 'regression',
    'metric' : 'rmse',
    'num_leaves' : 50, 
    'max_depth': 15,  
    'learning_rate' : 0.1,
    'feature_fraction' : 0.9,
    'verbosity' : -1,
    'feature_fraction_seed': 1234,
    'bagging_seed': 1234,
    'colsample_bytree': 0.99,
    'max_bin': 256,
    'num_iterations': 10000,
    #'reg_alpha': 5,
    #'reg_lambda': 10,
    #'min_split_gain': 0.4,
    'min_child_weight': 2,
    'min_child_samples':4,
}


lgb_clf = lgb.train(
    params,
    lgtall,
    num_boost_round=10000,
    valid_sets=[lgtrain, lgvalid],
    valid_names=["train", "valid"],
    early_stopping_rounds=500,
    verbose_eval=500)

print("RMSE of the validation set:", np.sqrt(mean_squared_error(y_validation, lgb_clf.predict(X_validation))))



Training until validation scores don't improve for 500 rounds.
[500]	train's rmse: 0.139763	valid's rmse: 0.133789
[1000]	train's rmse: 0.0679046	valid's rmse: 0.0646077
[1500]	train's rmse: 0.034467	valid's rmse: 0.0327563
[2000]	train's rmse: 0.0189989	valid's rmse: 0.017825
[2500]	train's rmse: 0.0121435	valid's rmse: 0.0109232
[3000]	train's rmse: 0.00930055	valid's rmse: 0.00811166
[3500]	train's rmse: 0.00850692	valid's rmse: 0.00718294
[4000]	train's rmse: 0.00824762	valid's rmse: 0.00691135
[4500]	train's rmse: 0.0081711	valid's rmse: 0.00681892
[5000]	train's rmse: 0.00814935	valid's rmse: 0.00679497
[5500]	train's rmse: 0.00814281	valid's rmse: 0.00678469
[6000]	train's rmse: 0.00814081	valid's rmse: 0.00678198
[6500]	train's rmse: 0.00814023	valid's rmse: 0.00678106
[7000]	train's rmse: 0.00814012	valid's rmse: 0.00678065
[7500]	train's rmse: 0.00814004	valid's rmse: 0.00678064
Early stopping, best iteration is:
[7177]	train's rmse: 0.00814012	valid's rmse: 0.00678051
RMSE o

In [187]:
ypred = lgb_clf.predict(X_train, num_iteration=lgb_clf.best_iteration)
ypred_val = lgb_clf.predict(X_validation, num_iteration=lgb_clf.best_iteration)

print(sqrt(mean_squared_error(y_train, ypred)))
print(sqrt(mean_squared_error(y_validation, ypred_val)))

0.008140122407075975
0.006780511847326791


In [188]:
yall = lgb_clf.predict(data=train1,num_iteration=lgb_clf.best_iteration)
yall=pd.DataFrame(yall)
yall.columns=['COST1']
yall["COST"] = np.expm1(yall["COST1"])

print(sqrt(mean_squared_error(tg.COST, yall.COST)))

2.1949415084571267


In [189]:
y_all=lgb_clf.predict(data=test1,num_iteration=lgb_clf.best_iteration)
y_all=pd.DataFrame(y_all)
y_all.columns=['COST1']
y_all["COST"] = np.expm1(y_all["COST1"])

del y_all["COST1"]

y_all.to_excel('submission_lightGBM1_log.xlsx', index=False)#0.8447

In [190]:
lgb_clf.save_model('model_lgb_log.txt')

<lightgbm.basic.Booster at 0x19f99cb44a8>

In [191]:
#Stacking

y_all=lgb_clf.predict(data=test1,num_iteration=lgb_clf.best_iteration)
y_all=pd.DataFrame(y_all)
y_all.columns=['lgb1']
y_all["lgb"] = np.expm1(y_all["lgb1"])
del y_all["lgb1"]

y_all.to_excel('submission_lightGBM_stck_val_log.xlsx', index=False)

y_all=lgb_clf.predict(data=train1,num_iteration=lgb_clf.best_iteration)
y_all=pd.DataFrame(y_all)
y_all.columns=['lgb1']
y_all["lgb"] = np.expm1(y_all["lgb1"])
del y_all["lgb1"]

y_all.to_excel('submission_lightGBM_stck_train_log.xlsx', index=False)