# Project Elective (IIIT-B)
### Part 1: This notebook covers the basics of cleaning and preprocessing data.


#### Project mentor: Prof. Manish Gupta, Onkar Hoysala 

In [None]:
"""In case of jupyter notebook , used this if there is no pre-existing environment"""
import sys
!{sys.executable} -m pip install pandas
!{sys.executable} -m pip install sklearn

In [None]:
import pandas as pd

In [None]:
# Load the raw NPTEL dataset. The path is relative, so the CSV must be in the
# notebook's working directory. Later cells read its `vtt_Content` column.
nptel_raw_data = pd.read_csv('nptel_final.csv')

In [None]:
# Preview only the first rows instead of rendering the entire DataFrame.
nptel_raw_data.head()

In [None]:
!pip3 install nltk
# Explicitely downloaded stopwords using: python3 -m nltk.downloader stopwords

## Data Preprocessing

### Stemming
#### With stemming, words are reduced to their word stems. A word stem need not be the same as the dictionary-based morphological root; it is simply a form of the word that is equal to or shorter than the original.
#### This can be improved further by lemmatization.

In [None]:
import nltk
import string
import re
# Shared stemmer instance; also the default stemmer of `porter_tokenizer`.
porter_stemmer = nltk.stem.porter.PorterStemmer()


def porter_tokenizer(text, stemmer=porter_stemmer):
    """Tokenize ``text``, stem every token and keep only alphabetic stems.

    Parameters
    ----------
    text : str
        Text to tokenize with ``nltk.wordpunct_tokenize``.
    stemmer : object, optional
        Any object exposing ``stem(token) -> str``. Defaults to the
        module-level ``porter_stemmer``.

    Returns
    -------
    list of str
        The stems, in order, that consist solely of the letters a-z / A-Z.
        Punctuation, digits and mixed tokens are dropped.
    """
    tokens = nltk.wordpunct_tokenize(text)
    # Use the `stemmer` argument: the previous body always called the global
    # `porter_stemmer`, silently ignoring any stemmer passed by the caller.
    stems = [stemmer.stem(t) for t in tokens]
    # The alphabetic filter is applied to the stems, not the raw tokens.
    no_punct = [s for s in stems if re.match('^[a-zA-Z]+$', s) is not None]
    return no_punct

In [None]:
# Stem every transcript in the `vtt_Content` column, one output entry per row.
# Missing transcripts (NaN) are replaced by an empty string first: NaN is a
# float and would crash on `.lower()`. Keeping an empty entry preserves the
# row alignment with `nptel_raw_data`.
vtt_content = nptel_raw_data.vtt_Content.fillna('').astype(str).tolist()
stemmed_vtt_content = []
for eachcontent in vtt_content:
    tokenized = porter_tokenizer(eachcontent.lower())
    preprocessed = ' '.join(tokenized)
    # Entries are stored as ASCII bytes (non-ASCII characters are dropped);
    # the stop-word removal cell decodes them again before splitting.
    stemmed_vtt_content.append(preprocessed.encode('ascii', 'ignore'))


#### Stopwords are removed from stemmed data to get more relevant data

In [None]:
from nltk.corpus import stopwords

# Stop-word list used for the manual filtering of the stemmed transcripts and
# passed to TfidfVectorizer as `stop_words`; it is therefore kept as a list.
stop_words_ascii = list(stopwords.words('english'))

# Single letters and filler words that, after looking at the raw transcripts,
# intuitively act as stop words here.
stop_words_ascii.extend(string.ascii_lowercase)
stop_words_ascii.extend(['ah','uh','the','and','so','is','ok','um','ha','language:','en'])

# The transcripts are stemmed BEFORE stop words are removed, so the stemmed form
# of each stop word must be listed too. Otherwise e.g. "because" -> "becaus" or
# "has" -> "ha" never matches the unstemmed entry and survives the filter.
stop_words_ascii.extend([porter_stemmer.stem(word) for word in stop_words_ascii])

# Drop duplicates while keeping first-seen order.
stop_words_ascii = list(dict.fromkeys(stop_words_ascii))


In [None]:
#removing stop words from stemmed vtt data

filtered_words_list = []
# Set membership is O(1); testing against the list re-scans every stop word
# for every single token of every transcript.
stop_word_lookup = set(stop_words_ascii)
for stemmed_content in stemmed_vtt_content:
    # The stemming cell stores ASCII-encoded bytes; decode those, but accept
    # plain `str` entries as well instead of failing on `.decode`.
    if isinstance(stemmed_content, bytes):
        stemmed_content = stemmed_content.decode("utf-8")
    filtered_words = [
        item for item in stemmed_content.split(" ") if item not in stop_word_lookup
    ]
    # One space-joined string per transcript, in the same order as the input.
    filtered_words_list.append(" ".join(filtered_words))
#print("FINAL_____________________________________________________________________",filtered_words_list)

In [None]:
# Spot-check the first transcript to verify that its words are stemmed and
# that the stop words have been removed.
filtered_words_list[0]

In [None]:
def check_frequency_of_relevant_words(filtered_words_list):
    """Print the word-frequency table of every document.

    Each entry of ``filtered_words_list`` is split on single spaces, and one
    line is printed per document: a list of ``(word, count)`` pairs ordered
    from most to least frequent. Nothing is returned.
    """
    from collections import Counter

    for document in filtered_words_list:
        word_counts = Counter(document.split(" "))
        print(word_counts.most_common())

check_frequency_of_relevant_words(filtered_words_list)

## Bag of Words : TFIDF Application 

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
# `min_df=1` (the library default) keeps every term that occurs in at least one
# document. That is the same vocabulary the integer `min_df=0` produced, but
# recent scikit-learn releases validate parameters and reject an integer 0.
tf = TfidfVectorizer(analyzer='word', ngram_range=(1,1), min_df=1, stop_words=stop_words_ascii)

In [None]:
tfidf_matrix = tf.fit_transform(filtered_words_list)
# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2;
# `get_feature_names_out()` is its replacement. The result is converted to a
# plain list so downstream indexing and `len()` behave as before. The fallback
# keeps the notebook running on older scikit-learn versions.
if hasattr(tf, 'get_feature_names_out'):
    feature_names = list(tf.get_feature_names_out())
else:
    feature_names = tf.get_feature_names()

In [None]:
# Vocabulary size: number of feature names reported by the fitted vectorizer.
len(feature_names)

In [None]:
# Inspect the matrix returned by `tf.fit_transform` (one row per transcript).
tfidf_matrix

In [None]:
from pandas import DataFrame
# For every transcript, collect its highest-scoring TF-IDF terms.
TOP_N_WORDS = 50  # number of terms kept per transcript

tfidf_words_list = []
# Read each row straight from the CSR arrays instead of calling `todense()`:
# a dense (documents x vocabulary) matrix can exhaust memory, while a CSR row
# only holds the terms that actually occur in that transcript.
tfidf_csr = tfidf_matrix.tocsr()
for i in range(tfidf_csr.shape[0]):
    start, end = tfidf_csr.indptr[i], tfidf_csr.indptr[i + 1]
    word_ids = tfidf_csr.indices[start:end]
    scores = tfidf_csr.data[start:end]
    phrase_scores = [pair for pair in zip(word_ids, scores) if pair[1] > 0]
    # Highest score first; ties are broken by ascending word id, which is the
    # order a stable sort over the dense row produced.
    sorted_phrase_scores = sorted(phrase_scores, key=lambda t: (-t[1], t[0]))
    top_n = [
        # Drop any non-ASCII characters from the term.
        feature_names[word_id].encode('ascii', 'ignore').decode("utf-8")
        for word_id, _score in sorted_phrase_scores[:TOP_N_WORDS]
    ]
    tfidf_words_list.append(top_n)

In [None]:
# Attach each transcript's top TF-IDF words as a new column and persist the
# labelled dataset next to the notebook.
nptel_raw_data['TF-IDF_top_words'] = tfidf_words_list
nptel_raw_data.to_csv('nptel_tfidf_labelled.csv')
# Only the last expression of a cell is rendered, so the preview goes here.
# The bare `nptel_raw_data` that sat in the middle of the cell showed nothing.
nptel_raw_data.head()
