In [1]:
import re
import numpy as np
import pandas as pd
import warnings;warnings.filterwarnings('ignore')
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB,MultinomialNB
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import LabelEncoder

In [4]:
# Load the labelled training reviews and the unlabelled test reviews.
# lineterminator='\n' makes the parser break records on '\n' only.
df_train = pd.read_csv('train.csv',lineterminator='\n')
df_test = pd.read_csv('20190425_test.csv',lineterminator='\n')

In [5]:
# Encode the sentiment labels as integers (Negative -> 0, Positive -> 1).
# The guard makes the cell safe to re-run: calling .map() a second time on the
# already-encoded 0/1 column would silently turn every label into NaN.
if not df_train['label'].isin([0, 1]).all():
    df_train['label'] = df_train['label'].map({'Negative':0,'Positive':1})
df_train.head()

Unnamed: 0,ID,review,label
0,1,Jo bhi ap se tou behtar hoon,0
1,2,ya Allah meri sister Affia ki madad farma,1
2,3,Yeh khud chahta a is umar main shadi krna. ha...,0
3,4,Tc ? Apky mun xe exe alfax achy nae lgty 😒💃,0
4,5,Good,1


In [6]:
# Count missing values per column of the training frame.
df_train.isna().sum()

ID        0
review    0
label     0
dtype: int64

In [7]:
# Count missing values per column of the training frame.
# NOTE(review): this repeats the check from the previous cell on df_train;
# possibly df_test was meant to be checked here - confirm.
df_train.isnull().sum()

ID        0
review    0
label     0
dtype: int64

In [8]:
# Class balance: number of training rows per encoded label (0 / 1).
df_train['label'].value_counts()

1    3361
0    2967
Name: label, dtype: int64

In [9]:
# Convert both frames to NumPy object arrays (one row per review).
# DataFrame.as_matrix() was deprecated in pandas 0.23 and removed in 1.0;
# .values yields the same array and is available on every pandas version.
numpy_array = df_train.values
numpy_array_test = df_test.values
numpy_array[:4]

array([[1, 'Jo bhi ap se tou behtar hoon', 0],
       [2, 'ya Allah meri sister Affia ki madad farma', 1],
       [3, 'Yeh khud chahta a is umar main shadi krna.  had ogi', 0],
       [4, 'Tc ? Apky mun xe exe alfax achy nae lgty 😒💃', 0]],
      dtype=object)

In [10]:
# Spot-check a single raw test row (positional index 115).
numpy_array_test[115]

array([116,
       'Tahum inho ne khud ko sirf science fiction films tak he mehdood nahi rakha bal ke har mauzoo par filmy stories likhin  direction ki aur kuch films produce bhi ki'],
      dtype=object)

In [11]:
# Two common ways to clean data
def cleaner(word):
  """Strip punctuation, digits and a few stray symbols from one token, then lowercase it.

  The substitutions run in a fixed order because some depend on earlier ones
  (for example "#." must be removed before the remaining dots are, and digits
  are dropped before the leading-underscore rule is applied).

  The original also contained three substitutions that could never match
  (a backslash-x rule placed after all backslashes were replaced, and two
  digit rules placed after all digits were removed). One of them used a
  non-raw pattern with the invalid escape '\\d'. They are dropped here;
  the result for every input is unchanged.

  Args:
    word: A single token (str), normally one space-delimited piece of a review.

  Returns:
    The cleaned, lower-cased token. May be an empty string.
  """
  word = re.sub(r'#\.', '', word)        # the two-character sequence "#."
  word = re.sub(r'[\n,.]', '', word)     # newlines, commas and remaining dots
  word = re.sub(r'[-\\]', ' ', word)     # hyphens and backslashes become spaces
  word = re.sub(r'\d', '', word)         # every digit
  word = re.sub(r'^_.', '', word)        # leading underscore plus the character after it
  word = re.sub(r'_', ' ', word)         # remaining underscores become spaces
  word = re.sub(r'^ ', '', word)         # at most one leading space
  word = re.sub(r' $', '', word)         # at most one trailing space
  word = re.sub(r'[?é§¦æ]', '', word)    # question marks and stray mis-encoded symbols
  return word.lower()
def hashing(word):
  """Collapse spelling variants of a romanised word into one canonical key.

  Applies an ordered list of regex rewrites (vowel/consonant run collapsing,
  suffix normalisation, u->o, k->q, ...) so that differently spelled forms of
  the same word map to the same string. The rules are order-dependent.

  Args:
    word: A single lower-case token (str).

  Returns:
    The normalised token. An empty input is returned unchanged.
  """
  word = re.sub(r'ain$', r'ein', word)
  word = re.sub(r'ai', r'ae', word)
  word = re.sub(r'ay$', r'e', word)
  word = re.sub(r'ey$', r'e', word)
  word = re.sub(r'ie$', r'y', word)
  word = re.sub(r'^es', r'is', word)
  word = re.sub(r'a+', r'a', word)
  word = re.sub(r'j+', r'j', word)
  word = re.sub(r'd+', r'd', word)
  word = re.sub(r'u', r'o', word)
  word = re.sub(r'o+', r'o', word)
  word = re.sub(r'ee+', r'i', word)
  # re.match anchors at the start: 'ar' is shortened to 'r' everywhere,
  # but only in words that do not themselves begin with 'ar'.
  if not re.match(r'ar', word):
    word = re.sub(r'ar', r'r', word)
  word = re.sub(r'iy+', r'i', word)
  word = re.sub(r'ih+', r'eh', word)
  word = re.sub(r's+', r's', word)
  # Fixed: the original searched the literal string 'word' instead of the
  # variable, so this y -> i rule could never fire.
  if re.search(r'[rst]y', word) and not word.endswith('y'):
    word = re.sub(r'y', r'i', word)
  # A trailing 'i' becomes 'y' when any 'i' follows a letter other than a/s.
  if re.search(r'[bcdefghijklmnopqrtuvwxyz]i', word):
    word = re.sub(r'i$', r'y', word)
  # Every 'h' is dropped when any 'h' follows a letter other than b/d/k/p.
  if re.search(r'[acefghijlmnoqrstuvwxyz]h', word):
    word = re.sub(r'h', '', word)
  word = re.sub(r'k', r'q', word)
  return word

def array_cleaner(array):
  """Clean every sentence of ``array`` token by token with ``cleaner``.

  Each sentence is split on single spaces, every piece is cleaned, and the
  pieces are glued back together with a space in front of each one, so
  every returned sentence starts with a space.

  Args:
    array: Iterable of sentences (str).

  Returns:
    A new list of cleaned sentences, in the same order as the input.
  """
  return [
    ''.join(' ' + cleaner(token) for token in text.split(' '))
    for text in array
  ]

In [12]:
# Take column 1 of the test matrix as the raw test sentences.
X_test = numpy_array_test[:,1]
X_test

array(['Mulazmat ke bahali ke dua farma dein aur koe wzeefa bhee bata dein',
       'Dua farma dain meri sehat k luay aur meray baal girna band ho jaye 1 saal say be inteha gir rahay hain',
       'Tum khabees nahi kutti aurat ho 😂😂😂😈😈', ...,
       'Mullah Umar Ne Afghan Hukomat amp Taliban Muzakrat Ki Himayat Kar Di Afghanistan Se Qabzay K Khatmay K Liye Muzakrat Jaiz Hen Paigham ',
       'Embroidery ki puri ek side pe dhagay nikle hue, fabric is average.',
       'tu marti bht h'], dtype=object)

In [13]:
# Test whether there are NaN (or other non-string) entries among the test sentences.
# Prints each offending value followed by its 1-based position; silent when all is well.
# An explicit type check replaces the bare `except`, which would also have hidden
# unrelated errors, and enumerate keeps the reported position correct after the
# first hit (the manual counter was only advanced on success).
for counter, sentence in enumerate(X_test, start=1):
    if not isinstance(sentence, str):
        print(sentence)
        print(counter)

In [14]:
# Column 2 of the training matrix holds the labels, column 1 the raw sentences.
y_train = numpy_array[:, 2]
# Clean X here
X_train = array_cleaner(numpy_array[:, 1])
X_test = array_cleaner(X_test)
X_train[:5]

[' jo bhi ap se tou behtar hoon',
 ' ya allah meri sister affia ki madad farma',
 ' yeh khud chahta a is umar main shadi krna  had ogi',
 ' tc  apky mun xe exe alfax achy nae lgty 😒💃',
 ' good']

In [15]:
# Number of cleaned training and test sentences, one per line.
print(len(X_train), len(X_test), sep='\n')

6328
2712


In [16]:
# Turn the label column into a compact int8 vector.
y_train = np.asarray(y_train).astype('int8')
print(y_train.shape)
y_train[:6]

(6328,)


array([0, 1, 0, 0, 1, 0], dtype=int8)

In [17]:
# Sanity check: list the distinct values present in the label vector.
test1 = pd.Series(y_train)
test1.unique()

array([0, 1], dtype=int64)

In [18]:
# TF-IDF over word unigrams up to `ngram`-grams.
# sublinear_tf applies 1 + log(tf) scaling; max_df=0.5 drops terms that occur
# in more than half of the documents.
ngram = 2
vectorizer = TfidfVectorizer(
    ngram_range=(1, ngram),
    max_df=0.5,
    sublinear_tf=True,
)

In [19]:
# Combine both to fit the TFIDF vectorization.
# NOTE(review): the vocabulary and IDF weights are learned from train AND test
# text; no labels are involved, but the features are not purely train-derived.
lentrain = len(X_train)

# list() guards against either input being a NumPy array, where `+` would not
# concatenate. The raw corpus gets its own name so X_all is only ever the matrix.
corpus_all = list(X_train) + list(X_test)

# fit_transform learns the vocabulary and builds the matrix in one pass instead
# of tokenizing the whole corpus twice (fit followed by transform).
X_all = vectorizer.fit_transform(corpus_all)

In [20]:
# Show the last five vocabulary entries.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2 in
# favour of get_feature_names_out(); use whichever the installed version has.
_feature_names = (
    vectorizer.get_feature_names_out()
    if hasattr(vectorizer, 'get_feature_names_out')
    else vectorizer.get_feature_names()
)
[str(name) for name in _feature_names[-5:]]

['賭easar ul', '鄭h', '鄭h isnan', '鄭pwa', '鄭pwa yani']

In [21]:
# (rows, features) of the combined train + test TF-IDF matrix.
X_all.shape

(9040, 113355)

In [22]:
# Separate back into training and test sets: the first `lentrain` rows are the
# training sentences, the remaining rows are the test sentences.
X_train_chuli, X_test_chuli = X_all[:lentrain], X_all[lentrain:]

In [26]:
# (rows, features) of the training part of the TF-IDF matrix.
X_train_chuli.shape

(6328, 113355)

In [32]:
pip install lightgbm

Note: you may need to restart the kernel to use updated packages.


In [40]:
#bayesian optimization to find hyperparameter for lightgbm
import lightgbm as lgb
from sklearn.model_selection import KFold,StratifiedKFold
from bayes_opt import BayesianOptimization
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn import metrics

In [41]:
def LGB_CV(min_data_in_leaf, feature_fraction, bagging_fraction):
    """Score one LightGBM hyper-parameter setting by 5-fold out-of-fold AUC.

    Serves as the objective function handed to ``BayesianOptimization``.
    Reads the notebook-level ``X_train_chuli`` (feature matrix) and
    ``y_train`` (labels) rather than taking them as arguments.

    Args:
        min_data_in_leaf: Minimum number of samples per leaf; truncated
            with ``int()`` before being passed to LightGBM.
        feature_fraction: Passed to LightGBM unchanged.
        bagging_fraction: Passed to LightGBM unchanged.

    Returns:
        ROC AUC of the out-of-fold predictions over all training rows.
    """
    splitter = KFold(n_splits=5, shuffle=True, random_state=2019)
    oof_scores = np.zeros(X_train_chuli.shape[0])

    for fold_no, (fit_rows, holdout_rows) in enumerate(splitter.split(X_train_chuli, y_train)):
        print("fold n°{}".format(fold_no))

        fit_set = lgb.Dataset(X_train_chuli[fit_rows], label=y_train[fit_rows])
        holdout_set = lgb.Dataset(X_train_chuli[holdout_rows], label=y_train[holdout_rows])

        booster_params = {
            'max_depth': -1,
            'min_data_in_leaf': int(min_data_in_leaf),
            'objective': 'binary',
            'bagging_fraction': bagging_fraction,
            'feature_fraction': feature_fraction,
            'learning_rate': 0.005,
            'boosting': 'gbdt',
            'bagging_freq': 5,
            'bagging_seed': 11,
            'metric': 'auc',
            'verbosity': -1,
        }

        # Up to 8000 rounds, stopping once the AUC has not improved for 500
        # rounds; progress is logged every 500 rounds.
        booster = lgb.train(
            booster_params,
            fit_set,
            num_boost_round=8000,
            valid_sets=[fit_set, holdout_set],
            verbose_eval=500,
            early_stopping_rounds=500,
        )

        # Predict the held-out rows with the best iteration found for this fold.
        oof_scores[holdout_rows] = booster.predict(
            X_train_chuli[holdout_rows],
            num_iteration=booster.best_iteration,
        )

    return metrics.roc_auc_score(y_train, oof_scores)


In [42]:
# Search space for the Bayesian optimiser: each entry is (lower, upper).
# random_state is fixed so the sampled points, and therefore the search
# result, are reproducible across kernel restarts.
LGB_BO = BayesianOptimization(
    f=LGB_CV,
    pbounds={
        'min_data_in_leaf': (2, 40),
        'bagging_fraction': (0.01, 0.999),
        'feature_fraction': (0.01, 0.999),
    },
    random_state=2019,
)

In [43]:
# Run the search: 2 initial points, then 2 optimisation iterations.
# Every evaluation calls LGB_CV, i.e. trains five LightGBM models.
LGB_BO.maximize(init_points=2,n_iter=2)

|   iter    |  target   | baggin... | featur... | min_da... |
-------------------------------------------------------------
fold n°0
Training until validation scores don't improve for 500 rounds.
[500]	training's auc: 0.842926	valid_1's auc: 0.770715
[1000]	training's auc: 0.874774	valid_1's auc: 0.774532
[1500]	training's auc: 0.896607	valid_1's auc: 0.772278
Early stopping, best iteration is:
[1170]	training's auc: 0.882696	valid_1's auc: 0.77486
fold n°1
Training until validation scores don't improve for 500 rounds.
[500]	training's auc: 0.844161	valid_1's auc: 0.758139
[1000]	training's auc: 0.875512	valid_1's auc: 0.763042
[1500]	training's auc: 0.897401	valid_1's auc: 0.762979
Early stopping, best iteration is:
[1341]	training's auc: 0.891025	valid_1's auc: 0.764019
fold n°2
Training until validation scores don't improve for 500 rounds.
[500]	training's auc: 0.844765	valid_1's auc: 0.738737
[1000]	training's auc: 0.877674	valid_1's auc: 0.754196
[1500]	training's auc: 0.898784	va

In [46]:
# Final 5-fold training run: one LightGBM model per fold, out-of-fold
# predictions collected in `oof`, test predictions averaged over the folds
# in `predictions`.
# NOTE(review): `oof` is filled but never scored in this cell.
folds = KFold(n_splits=5, shuffle=True, random_state=2019)
oof = np.zeros(X_train_chuli.shape[0])
predictions = np.zeros(X_test_chuli.shape[0])
for fold_, (trn_idx, val_idx) in enumerate(folds.split(X_train_chuli, y_train)):
    print("fold n°{}".format(fold_))
    trn_data = lgb.Dataset(X_train_chuli[trn_idx],
                           label=y_train[trn_idx],
                           )
    val_data = lgb.Dataset(X_train_chuli[val_idx],
                           label=y_train[val_idx],
                           )

    # NOTE(review): min_data_in_leaf / bagging_fraction / feature_fraction are
    # hard-coded here; presumably copied from the Bayesian search - confirm.
    param = {
        'max_depth': -1,
        'min_data_in_leaf': 2, 
        'objective':'binary',
        'bagging_fraction':0.999,
        'feature_fraction':0.999,
        'learning_rate': 0.005,
        "boosting": "gbdt",
        "bagging_freq": 5,
        "bagging_seed": 11,
        "metric": 'auc',
        "verbosity": -1
    }

    # Up to 8000 boosting rounds, logging every 500 and stopping after 500
    # rounds without improvement.
    clf = lgb.train(param,
                    trn_data,
                    8000,
                    valid_sets = [trn_data, val_data],
                    verbose_eval=500,
                    early_stopping_rounds = 500)

    # Score the held-out rows, then add this fold's share of the test prediction.
    oof[val_idx] = clf.predict(X_train_chuli[val_idx],
                               num_iteration=clf.best_iteration)
    predictions += clf.predict(X_test_chuli, num_iteration=clf.best_iteration) / folds.n_splits

fold n°0
Training until validation scores don't improve for 500 rounds.
[500]	training's auc: 0.879002	valid_1's auc: 0.790315
[1000]	training's auc: 0.932447	valid_1's auc: 0.802947
[1500]	training's auc: 0.961179	valid_1's auc: 0.807213
[2000]	training's auc: 0.978141	valid_1's auc: 0.809524
[2500]	training's auc: 0.987611	valid_1's auc: 0.810058
[3000]	training's auc: 0.993031	valid_1's auc: 0.810405
[3500]	training's auc: 0.99613	valid_1's auc: 0.810717
Early stopping, best iteration is:
[3269]	training's auc: 0.994952	valid_1's auc: 0.811215
fold n°1
Training until validation scores don't improve for 500 rounds.
[500]	training's auc: 0.880803	valid_1's auc: 0.782901
[1000]	training's auc: 0.932914	valid_1's auc: 0.797981
[1500]	training's auc: 0.961158	valid_1's auc: 0.802665
[2000]	training's auc: 0.977653	valid_1's auc: 0.805107
[2500]	training's auc: 0.987441	valid_1's auc: 0.807167
[3000]	training's auc: 0.993237	valid_1's auc: 0.807052
Early stopping, best iteration is:
[2555

In [47]:
# Sanity check: number of test predictions and the first few values.
print(len(predictions))
predictions[:4]

2712


array([0.87350113, 0.6612115 , 0.03153293, 0.91210489])

In [48]:
# Write the submission file: one row per test ID with the fold-averaged
# prediction score in "Pred" (a float score, not a 0/1 class label).
lgb_output = pd.DataFrame({"ID":df_test["ID"], "Pred":predictions})
lgb_output.to_csv('lgb_new.csv', index = False)