In [172]:

import os
import pandas as pd

from functools import reduce

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

import nltk
import seaborn as sns
import matplotlib.pyplot as plt

import numpy as np  
import re  
import nltk 
 
import pickle  
from nltk.corpus import stopwords

In [2]:

ROOT_DIR = r'/Users/shabhushan/Desktop/python/python-code/dataset/notracking/participants'
TRAIN_LABELS = os.path.join(ROOT_DIR, r'train', r'labels', r'labels.csv')
TRAIN_TEXT = os.path.join(ROOT_DIR, r'train', r'extracted_data', r'extract_combined.csv')
TEST_TEXT = os.path.join(ROOT_DIR, r'test', r'extracted_data', r'extract_combined.csv')


In [3]:

# read in training and testing data
# one dataframe for labels another for text features
train_labels_df = pd.read_csv(TRAIN_LABELS, usecols=['document_name','is_fitara'])
train_text_df = pd.read_csv(TRAIN_TEXT)
test_df = pd.read_csv(TEST_TEXT)

# combine labels with text features
train_df = pd.merge(
    train_labels_df, 
    train_text_df, 
    on='document_name', 
    how='inner'
)

# remove dataframes that are no longer needed from memory 
del train_labels_df
del train_text_df


Since Positive and Negative classes are size 71% and 29% respectively. Hence, no severe class imbalance.

In [4]:

# confirm class distribution
# is_fitara - yes: ~29%; no: ~71%
train_df['is_fitara'].value_counts(normalize=True)


No     0.713089
Yes    0.286911
Name: is_fitara, dtype: float64

In [5]:
pd.read_csv(TRAIN_LABELS).isnull().sum()

solicitation_id                77
contract_award_number         824
document_name                   0
is_fitara                       0
contains_statement_of_work      0
dtype: int64

In [54]:
def get_set_from_word_list(lst):
    temp_set_list = [set(nltk.word_tokenize(words)) for words in lst]

    return reduce(lambda x, y: {*x, *y}, temp_set_list)

In [89]:
#set_no = get_set_from_word_list(train_df_temp)
def get_word_frequency(df):
    tokenized_words = [nltk.word_tokenize(words) for words in df]
    words_list = reduce(lambda x, y: [*x, *y], tokenized_words)

    vectorizer = CountVectorizer(stop_words='english')
    vectorizer.fit_transform(words_list)

    return pd.DataFrame(vectorizer.vocabulary_.items(), columns=['Text', 'Frequency']).sort_values(by='Frequency', ascending=False)

In [181]:
def get_tf_idf(df):
    vectorizer = TfidfVectorizer(stop_words='english', use_idf=True)
    X = vectorizer.fit_transform(df)

    return pd.DataFrame(X.toarray(), columns = vectorizer.get_feature_names())

In [190]:
#tokenized_words = [nltk.word_tokenize(words) for words in train_df_temp]
#words_list = reduce(lambda x, y: [*x, *y], tokenized_words)

#vectorizer = CountVectorizer(stop_words='english')
#vectorizer.fit_transform(words_list)

#pd.DataFrame(vectorizer.vocabulary_.items(), columns=['Text', 'Frequency']).sort_values(by='Frequency', ascending=False)

In [193]:
train_df_no = train_df[train_df.is_fitara == 'No']
train_df_yes = train_df[train_df.is_fitara == 'Yes']

In [194]:
# Get 15% of total Records for Ablation
ablation = 0.15
train_df_no_ablation = train_df_no.loc[0:int(len(train_df_no) * ablation), 'text']
train_df_yes_ablation = train_df_yes.loc[0:int(len(train_df_yes) * ablation), 'text']

In [192]:
get_tf_idf(train_df_no_ablation)

Unnamed: 0,00,000,001,00100514,00101600,0011995,0012016,002,003,00382903371679,...,ﬂexible,ﬂexlble,ﬂie,ﬂoor,ﬂoors,ﬂow,ﬂown,ﬂows,ﬂu,ﬂuid
0,0.005505,0.046179,0.001945,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.029788,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.025835,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
76,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
77,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
78,0.000000,0.013777,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
79,0.000000,0.018160,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
