In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from spacy.tokens import DocBin
import spacy
from nltk.stem import PorterStemmer
from nltk.tokenize import RegexpTokenizer
from nltk.stem import PorterStemmer
from nltk.tokenize import RegexpTokenizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn import preprocessing
from nltk.tokenize.treebank import TreebankWordDetokenizer
from sklearn.pipeline import Pipeline
from itertools import compress
from sklearn.preprocessing import StandardScaler
%matplotlib notebook


In [None]:
# Enter correct paths here
# Every input and output lives in the same folder, so only `base_dir` needs to
# change when the notebook is run from a different location.
base_dir = '/content/drive/MyDrive/TechSoc Submission'
train_path = base_dir + '/train.csv'          # labelled training data
test_path = base_dir + '/test.csv'            # unlabelled data to predict on
stopwords_path = base_dir + '/stopwords.txt'  # stop-word list, read line by line
save_path = base_dir + '/Final.csv'           # where the predictions are written
model_path = base_dir + '/mymodel.joblib'     # where the session snapshot is written

In [None]:
# Text normalisation helpers shared by the preprocessing functions below.
stemmer = PorterStemmer()
tokenizer = RegexpTokenizer(r'\w+')

# One bag-of-words vectorizer per text column. Only the content vectorizer is
# tuned (unigrams + bigrams, max_df=0.8); the other two use the defaults.
Content_Vectorizer = CountVectorizer(ngram_range=(1, 2), max_df=0.8, min_df=1)
Nouns_Vectorizer = CountVectorizer()
Ent_Vectorizer = CountVectorizer()

# One independent scaler per feature group, all created with with_mean=False.
Content_scaler, Nouns_scaler, Ent_scaler, others_scaler = (
    preprocessing.StandardScaler(with_mean=False) for _ in range(4)
)

In [None]:
# Read the stop-word list (one word per line) and store the *stemmed* form of
# each word, so it can be compared against stemmed tokens later on.
stopwords = []
with open(stopwords_path, "r") as file:  # the context manager closes the handle
    for line in file:
        word = line.strip()
        if word:  # ignore blank lines
            stopwords.append(stemmer.stem(word))

In [None]:
# Load the spaCy pipeline used by extra() for part-of-speech tags and entities.
# NOTE(review): spacy is already imported in the first cell; this import is redundant.
import spacy
nlp = spacy.load("en_core_web_sm")

In [None]:
# Training data. The functions below expect the columns `title`, `uid`,
# `content` and (for training) `target_ind`.
df = pd.read_csv(train_path)

In [None]:
def extra(df):
    """Add space-joined noun and named-entity text columns to ``df``.

    Every value of ``df.content`` is run through the global spaCy pipeline
    ``nlp``. Two columns are written to ``df`` (in place) and the same frame
    is returned:

    * ``Nouns`` - the text of every token whose ``pos_`` is ``'NOUN'``
    * ``Ent``   - the text of every entity in ``doc.ents``

    The values are collected in lists and assigned as whole columns. Writing
    them cell by cell with ``df[col][i] = value`` treats the loop position as
    an index *label*, which fails on frames whose index is not 0..n-1, and is
    a chained assignment that pandas does not guarantee to propagate.

    A ``#`` is printed every 4000 documents as a progress indicator.
    """
    noun_texts = []
    ent_texts = []
    for i, doc in enumerate(nlp.pipe(df.content, n_process=-1)):
        noun_texts.append(' '.join(tok.text for tok in doc if tok.pos_ == 'NOUN'))
        ent_texts.append(' '.join(span.text for span in doc.ents))
        if i % 4000 == 0:
            print('#', end='')
    # Positional assignment: row k receives the k-th document's result,
    # whatever the index labels are.
    df['Nouns'] = noun_texts
    df['Ent'] = ent_texts
    return df

In [None]:
def PreProcesser(df):
    """Turn a raw frame into the cleaned text and numeric feature columns.

    Steps, in order:

    1. drop ``title`` and ``uid``;
    2. compute ``TolWords``, ``TolSentance`` (character count) and
       ``AvgWordLen`` from the raw ``content``;
    3. add ``Nouns`` and ``Ent`` via ``extra``;
    4. ``content``: stem, then remove stop words;
       ``Nouns``: stem only;
       ``Ent``: remove stop words, then stem;
    5. compute ``TolSentanceFin``, ``AvgWordLenFin``, ``TolNouns`` and
       ``TolEnts`` from the cleaned columns.

    Returns a new frame; the argument itself is not modified.

    NOTE(review): the stop-word list is compared against stemmed tokens for
    ``content`` but against un-stemmed tokens for ``Ent``, and ``Nouns`` is
    never filtered. Confirm this asymmetry is intended.
    """
    def _stem_text(text):
        # Tokenise, stem each token, and join the stems back into one string.
        stems = (stemmer.stem(tok) for tok in tokenizer.tokenize(text))
        return TreebankWordDetokenizer().detokenize(stems)

    def _without_stopwords(text):
        # Keep the whitespace-separated tokens that are not in `stopwords`.
        kept = [tok for tok in text.split() if tok not in stopwords]
        return TreebankWordDetokenizer().detokenize(kept)

    def _space_token_count(text):
        return len(text.split(' '))

    def _mean_token_len(text):
        return np.mean([len(tok) for tok in text.split(' ')])

    print('Preprocessing start...')
    df = df.drop(columns = ['title','uid'])

    # Statistics of the raw text.
    df['TolWords'] = df['content'].apply(_space_token_count)
    df['TolSentance'] = df['content'].apply(len)
    df['AvgWordLen'] = df['content'].apply(_mean_token_len)
    print('@',end='')

    # spaCy-derived columns.
    df = extra(df)
    print('@',end='')

    # Clean the three text columns.
    df['content'] = df['content'].apply(_stem_text)
    df['content'] = df['content'].apply(_without_stopwords)
    print('#',end='')
    df['Nouns'] = df['Nouns'].apply(_stem_text)
    df['Ent'] = df['Ent'].apply(_without_stopwords)
    df['Ent'] = df['Ent'].apply(_stem_text)
    print('#',end='')

    # Statistics of the cleaned text.
    df['TolSentanceFin'] = df['content'].apply(len)
    df['AvgWordLenFin'] = df['content'].apply(_mean_token_len)
    df['TolNouns'] = df['Nouns'].apply(_space_token_count)
    print('#',end='')
    df['TolEnts'] = df['Ent'].apply(_space_token_count)
    print('Preprocessing completed')
    return df

In [None]:
def transform(df,fit=False,save=False):
    """Vectorize and scale a preprocessed frame.

    Parameters
    ----------
    df : DataFrame
        Output of ``PreProcesser``: the text columns ``content``, ``Nouns``
        and ``Ent``, an optional ``target_ind`` label column, and numeric
        feature columns (every remaining column is treated as a feature).
    fit : bool
        When True the global vectorizers and scalers are fitted on ``df``
        before being applied; when False they must already be fitted.
    save : bool
        When True the *unscaled* ``c``, ``o``, ``n`` and ``e`` are dumped
        with joblib into ``./data/``.

    Returns
    -------
    tuple ``(o, c, n, e)``
        Scaled numeric features, content counts, noun counts, entity counts.
    """
    print('Transformation Begins')
    # Drop the label only when it is present (test data has none). An explicit
    # check replaces a bare `except:` that would have hidden unrelated errors.
    non_feature_cols = ['content', 'Nouns', 'Ent']
    if 'target_ind' in df.columns:
        non_feature_cols.append('target_ind')
    o = df.drop(columns=non_feature_cols).to_numpy()
    if fit:
        Content_Vectorizer.fit(df.content)
        Nouns_Vectorizer.fit(df.Nouns)
        Ent_Vectorizer.fit(df.Ent)
        print('Fitted Vectorizers')
    c = Content_Vectorizer.transform(df['content'])
    n = Nouns_Vectorizer.transform(df['Nouns'])
    e = Ent_Vectorizer.transform(df['Ent'])
    print('Transformed Vectorizers')
    if fit:
        Content_scaler.fit(c)
        Nouns_scaler.fit(n)
        Ent_scaler.fit(e)
        others_scaler.fit(o)
        print('Fitted Scalers')
    if save:
        from joblib import dump
        # Saved before scaling, so these files hold raw counts / raw features.
        dump(c,"./data/c.joblib")
        dump(o,"./data/o.joblib")
        dump(n,"./data/n.joblib")
        dump(e,"./data/e.joblib")
    c = Content_scaler.transform(c)
    n = Nouns_scaler.transform(n)
    e = Ent_scaler.transform(e)
    o = others_scaler.transform(o)
    print(o.shape)
    print('Transformed Scalers')
    print('Transformation Completed')
    return (o,c,n,e)

In [None]:
def combine(o,c,n,e):
    """Stack the feature groups column-wise and return them as a CSR matrix.

    Only ``o``, ``n`` and ``e`` are stacked, in that order. ``c`` is accepted
    so the call matches the tuple returned by ``transform``, but it is not
    part of the result.
    """
    print('Combiner Begins')
    from scipy import sparse
    stacked = sparse.hstack((o, n, e))
    return sparse.csr_matrix(stacked)

In [None]:
def Pipe(df,fit=False,save=False):
    """Run the whole feature pipeline on a raw frame.

    Calls ``PreProcesser``, ``transform`` and ``combine`` in turn.

    Parameters
    ----------
    df : DataFrame
        Raw data as read from the CSV files.
    fit : bool
        Passed to ``transform``; True fits the vectorizers and scalers.
    save : bool
        When True, ``X`` and ``Y`` are dumped with joblib into ``./data/``.
        It is not forwarded to ``transform``.

    Returns
    -------
    tuple ``(X, Y)``
        ``X`` is the combined feature matrix. ``Y`` is the ``target_ind``
        column, or the integer ``0`` when the frame has no such column.
    """
    df=PreProcesser(df)
    (o,c,n,e)=transform(df,fit)

    # Explicit presence check instead of a bare `except:` so that real errors
    # are not silently converted into "no labels".
    if 'target_ind' in df.columns:
        Y = df['target_ind']
    else:
        Y = 0
    X = combine(o,c,n,e)
    if save:
        from joblib import dump

        dump(X,"./data/X.joblib")
        dump(Y,"./data/Y.joblib")
    return (X,Y)

In [None]:
def Process(df,fit=False,n_jobs=4):
    """Run ``PreProcesser`` over ``df`` in ``n_jobs`` worker processes.

    The frame is cut into ``n_jobs`` contiguous row chunks (sizes as produced
    by ``np.array_split``), each chunk is preprocessed in a worker, and the
    results are concatenated in the original order. Chunks keep their
    original index labels.

    Parameters
    ----------
    df : DataFrame
        Raw data to preprocess.
    fit : bool
        Unused; kept so existing calls keep working.
    n_jobs : int
        Number of chunks and worker processes.

    Returns
    -------
    DataFrame
        The concatenated ``PreProcesser`` outputs.
    """
    from multiprocess import Pool

    print('Using %d jobs' %n_jobs)
    # Split row *positions* and slice with iloc: calling np.array_split on the
    # DataFrame itself goes through the deprecated DataFrame.swapaxes.
    row_groups = np.array_split(np.arange(len(df)), n_jobs)
    df_split = [df.iloc[rows] for rows in row_groups]
    pool = Pool(n_jobs)
    try:
        df = pd.concat(pool.map( PreProcesser , df_split))
    finally:
        # Always release the workers, even when a chunk raises.
        pool.close()
        pool.join()
    print('Preprocessing completed')
    return df

In [None]:
# Turn off pandas' chained-assignment warning for the rest of the notebook.
# NOTE(review): this silences the warning globally and can hide real bugs;
# prefer fixing the assignment that triggers it and removing this line.
pd.options.mode.chained_assignment = None

In [None]:
# Build the training matrix and labels; fit=True fits the vectorizers and scalers on this data.
(X,Y)=Pipe(df,fit=True,save=False)

Preprocessing start...
@#########@###Preprocessing completed
Transformation Begins
Fitted Vectorizers
Transformed Vectorizers
Fitted Scalers
(35112, 7)
Transformed Scalers
Transformation Completed
Selector Begins


In [None]:
# Sanity check: (rows, combined feature columns).
X.shape

(35112, 52911)

In [None]:
# Sanity check: one label per row of X.
Y.shape

(35112,)

In [None]:
# Feature selection: SelectFromModel is fitted with a random forest on the first
# 28000 rows (no threshold is passed, so its default applies).
# `ind` is the column mask from get_support(); `X_` is X reduced to those columns.
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel
RF_ = RandomForestClassifier(random_state=0,max_depth = 50,verbose=1,n_jobs=4,n_estimators=100)
selector = SelectFromModel(estimator=RF_).fit(X[:28000],Y[:28000])
ind=selector.get_support()
X_ = X[:,ind]

In [None]:
# Shape after feature selection (same rows, fewer columns).
X_.shape

In [None]:
# Final model: a larger forest trained on the selected columns of the first
# 28000 rows. `RF_` is rebound here, replacing the instance created for
# feature selection, so from now on it expects inputs with X_'s columns.
from sklearn.ensemble import RandomForestClassifier
RF_ = RandomForestClassifier(random_state=0,max_depth = 200,verbose=2,n_jobs=-1,n_estimators=500)
RF_.fit(X_[:28000],Y[:28000])

In [None]:
# Hold-out accuracy on the rows after the first 28000. The forest was trained on
# the feature-selected matrix X_, so it has to be scored on X_ as well; the full
# matrix X has a different number of columns.
RF_.score(X_[28000:],Y[28000:])

In [None]:
import time  # not imported by any earlier cell

# `t` must hold a time.time() stamp taken before the work being timed. No
# earlier cell assigns it, so report that instead of failing with NameError
# on Restart & Run All.
try:
    elapsed = time.time() - t
except NameError:
    elapsed = None
    print('Start time `t` is not set; run `t = time.time()` before the timed cells.')
else:
    print(elapsed)

In [None]:
# Save the current interpreter session with dill to model_path.
# NOTE(review): model_path ends in ".joblib" although the file is written by
# dill, not joblib; loading it back would need dill's session loader — confirm
# and consider renaming.
from dill import dump_session
dump_session(model_path)

In [None]:
# Data to predict on; it goes through the same Pipe() as the training data.
df_pred = pd.read_csv(test_path)

In [None]:
# fit=False reuses the vectorizers and scalers fitted on the training data.
# The second return value (labels, or 0 when there is no target_ind column) is discarded.
(X_pred,_)=Pipe(df_pred,fit=False)

In [None]:
# RF_ was trained on the columns kept by the feature selector, so the prediction
# matrix must be reduced with the same mask `ind` before calling predict.
y=RF_.predict(X_pred[:,ind])

In [None]:
# Submission file: one prediction per test row, keyed by the `uid` values read
# from the test file. A default RangeIndex would write the row number as `uid`,
# which is only correct if the test uids happen to be 0..n-1.
df_fin = pd.DataFrame({'target_ind': y}, index=pd.Index(df_pred['uid'], name='uid'))
df_fin.to_csv(save_path,index=True)