In [None]:
from __future__ import division
import pandas as pd
import numpy as np
import csv
import nltk
from nltk.stem import PorterStemmer
from nltk.tokenize import WordPunctTokenizer
import matplotlib

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

In [None]:
import codecs
import re
import copy
import collections
import csv

In [None]:
#### Read the input files
# Both files are read as tab-separated text without a header row (header=None),
# so the two columns are named explicitly right after loading.

# train.docs: one row per document -> columns 'id' and 'text'
train_doc_file = pd.read_csv("train.docs", encoding = 'utf-8', sep='\t', header=None)
train_doc_file.columns = ['id', 'text']

# train.nontopic-titles.queries: one row per query -> columns 'id' and 'text'
train_query_file = pd.read_csv("train.nontopic-titles.queries", encoding = 'utf-8', sep='\t', header=None)
train_query_file.columns = ['id', 'text']


In [None]:
### Load the English stop-word list
# nltk.download fetches the "stopwords" corpus so that the import/lookup below can find it.
nltk.download("stopwords")
from nltk.corpus import stopwords
# esw: English stop words, used by the token filter and the CountVectorizer calls in this notebook.
esw = stopwords.words("english")

In [None]:
# generate tokens list
# A token is kept only if it consists solely of word characters.
# Raw string: "\w" is a regex class, not a valid Python string escape.
word_pattern = re.compile(r"^\w+$")

def get_text_counter(text):
    """Count the stemmed, stop-word-free terms of a document collection.

    Every document is split with ``WordPunctTokenizer``; each token is
    lower-cased, dropped unless it matches ``word_pattern`` and is not in the
    stop-word list ``esw``, and finally reduced to its Porter stem.

    Parameters
    ----------
    text : iterable of str
        The documents to analyse (e.g. a pandas Series of strings).

    Returns
    -------
    (collections.Counter, int)
        A counter mapping stem -> number of occurrences, and the total number
        of tokens that survived the filtering.
    """
    tokenizer = WordPunctTokenizer()
    stemmer = PorterStemmer()
    stop_words = set(esw)  # set lookup instead of scanning the list for every token

    # Count surface forms first so that each distinct word is stemmed only once.
    surface_counts = collections.Counter()
    for document in text:
        for token in tokenizer.tokenize(document):
            token = token.lower()
            if word_pattern.match(token) and token not in stop_words:
                surface_counts[token] += 1

    # The stemmer operates on single words, so it is applied per token and
    # never to a whole document string.
    stem_counts = collections.Counter()
    for token, occurrences in surface_counts.items():
        stem_counts[stemmer.stem(token)] += occurrences

    return stem_counts, sum(stem_counts.values())

In [None]:
# get term frequency list
def get_term_freq(tokens, size):
    """Tabulate absolute and relative term frequencies.

    Parameters
    ----------
    tokens : sequence of (term, count) pairs
        For example the result of ``collections.Counter.most_common()``.
    size : number
        Denominator used for the relative frequency (``count / size``).

    Returns
    -------
    pandas.DataFrame
        Indexed by ``'term'`` with the columns ``'term frequency'`` and
        ``'relative frequency'``, in the order of ``tokens``.
    """
    terms = [pair[0] for pair in tokens]
    abs_tf = np.array([pair[1] for pair in tokens])
    rel_tf = abs_tf / size

    # Build the frame column by column: stacking both arrays into a single
    # ndarray would upcast the integer counts to float.
    return pd.DataFrame(
        {'term frequency': abs_tf, 'relative frequency': rel_tf},
        index=pd.Index(terms, name='term'),
        columns=['term frequency', 'relative frequency'],
    )

In [None]:
# Term frequency list of train.docs, written to train_term_frequency.csv.
# `size` is the total number of kept tokens, which is never smaller than the number
# of distinct terms, so most_common(size) returns every term (most frequent first).
counter, size = get_text_counter(train_doc_file.text)

train_df = get_term_freq(counter.most_common(size), size)
train_df.to_csv("train_term_frequency.csv")

# Term frequency list of train.nontopic-titles.queries, written to train_query_term_frequency.csv.
# NOTE: `counter`, `size` and `train_df` are overwritten here, so after this cell
# they describe the queries, not the documents.
counter, size = get_text_counter(train_query_file.text)

train_df = get_term_freq(counter.most_common(size), size)
train_df.to_csv("train_query_term_frequency.csv")

In [None]:
## preprocessing

# Remove the token "num": it has by far the highest term frequency (45538) in the
# original file, while the second most frequent term only has a frequency of 3750.
# The pattern is anchored on word boundaries so that only the standalone token is
# removed; a plain substring replace would also damage every word that merely
# contains "num" (e.g. "number" -> "ber"). regex=True is passed explicitly because
# the default of this parameter differs between pandas versions.
train_doc_file["text"] = train_doc_file["text"].str.replace(r"\bnum\b", "", regex=True)

In [None]:
### create DTM

 ## get DTM, weighted by tfidf
def get_DTM_tfidf(file):
    """Return the tf-idf weighted document-term matrix of ``file``.

    ``file`` is the collection of raw document strings. Terms are counted with
    a ``CountVectorizer`` that drops the stop words in ``esw``; the counts are
    then re-weighted by a default ``TfidfTransformer``.
    """
    term_counts = CountVectorizer(stop_words=esw).fit_transform(file)
    return TfidfTransformer().fit_transform(term_counts)


## get DTM, weighted by tf only (no idf)
def get_DTM_tf(file):
    """Return the term-frequency weighted document-term matrix of ``file``.

    ``file`` is the collection of raw document strings. Terms are counted with
    a ``CountVectorizer`` that drops the stop words in ``esw``; the counts are
    then passed through ``TfidfTransformer(use_idf=False)``, i.e. without the
    inverse-document-frequency factor.
    """
    term_counts = CountVectorizer(stop_words=esw).fit_transform(file)
    tf_weighting = TfidfTransformer(use_idf=False)
    tf_weighting.fit(term_counts)
    return tf_weighting.transform(term_counts)




In [None]:
### Create the tf-idf weighted DTM for the train.docs file
# Rows correspond to the entries of train_doc_file.text, columns to vocabulary terms.
train_tfidf = get_DTM_tfidf(train_doc_file.text)
# Bare last expression: shows the matrix repr as the cell output.
train_tfidf

In [None]:
###Generate the query vector
def get_QueryVector(queryFile, docFile=None):
    """Vectorize queries in the vocabulary space of the documents.

    Parameters
    ----------
    queryFile : iterable of str
        The query texts.
    docFile : iterable of str, optional
        The documents that define the vocabulary. Defaults to the current
        ``train_doc_file.text``; it is looked up at call time so that the
        function does not keep using a stale copy captured when the cell
        defining it was run.

    Returns
    -------
    scipy sparse matrix
        Raw term counts, one row per query and one column per vocabulary term.
        The vocabulary is fitted exactly like in the DTM helpers
        (``CountVectorizer(stop_words=esw)``), so for the same documents the
        columns line up with those of the document-term matrix.
    """
    if docFile is None:
        docFile = train_doc_file.text

    doc_vectorizer = CountVectorizer(stop_words=esw)  # remove english stopwords
    doc_vectorizer.fit(docFile)

    # Reuse the fitted vectorizer instead of rebuilding one from the vocabulary
    # keys: a list of keys would be re-indexed in list order, which is not the
    # column order of the fitted vocabulary.
    return doc_vectorizer.transform(queryFile)

In [None]:
### create query vector matrix for the train.nontopic-titles.queries file 
#query_vect = get_QueryVector(train_query_file.text, train_doc_file.text)
#query_vect