In [1]:
import pandas as pd

import unicodedata
import re
import json

import nltk
from nltk.tokenize.toktok import ToktokTokenizer
from nltk.corpus import stopwords

In [2]:
def basic_clean(string):
    """
    Normalize raw text into lowercase ASCII.

    Steps, in order: lowercase the input, decompose it with Unicode NFKD
    and drop everything that cannot be encoded as ASCII, then delete every
    character that is not a lowercase letter, a digit, an apostrophe, or
    whitespace.

    Parameters
    ----------
    string : str
        Text to normalize.

    Returns
    -------
    str
        The cleaned text.
    """
    lowered = string.lower()
    # NFKD splits an accented character into base letter + combining mark, so
    # the ASCII encode below keeps the base letter and discards the mark.
    ascii_bytes = unicodedata.normalize('NFKD', lowered).encode('ascii', 'ignore')
    ascii_text = ascii_bytes.decode('utf-8', 'ignore')
    return re.sub(r"[^a-z0-9'\s]", '', ascii_text)

In [3]:
def clean_html(string):
    """
    Strip HTML tags and http(s) URLs from text and normalize its whitespace.

    Tags and URLs are replaced by a single space instead of being deleted,
    and every run of whitespace (including newlines) is collapsed to one
    space. Deleting them outright would glue the neighbouring words together
    (e.g. ``"one<br>two"`` or ``"one\\ntwo"`` becoming ``"onetwo"``), which
    corrupts any later tokenization.

    Parameters
    ----------
    string : str
        Raw text that may contain HTML markup and URLs.

    Returns
    -------
    str
        The text without tags or URLs, words separated by single spaces and
        no leading or trailing whitespace.
    """
    # Anything between '<' and the next '>' is treated as a tag.
    string = re.sub(r'<[^>]*>', ' ', string)
    string = re.sub(r"https?:\/\/(www\.)?[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b([-a-zA-Z0-9()@:%_\+.~#?&//=]*)", ' ', string)
    # Collapse newlines, tabs and repeated spaces (including the padding
    # introduced by the two substitutions above) into single spaces.
    string = re.sub(r'\s+', ' ', string)
    return string.strip()

In [4]:
def tokenize(string):
    """
    Tokenize text with NLTK's ToktokTokenizer.

    The tokenizer is asked for a string result (``return_str=True``), so the
    value returned is whatever string the tokenizer produces rather than a
    list of tokens.

    Parameters
    ----------
    string : str
        Text to tokenize.

    Returns
    -------
    str
        The tokenized text.
    """
    toktok = nltk.tokenize.ToktokTokenizer()
    tokenized = toktok.tokenize(string, return_str=True)
    return tokenized

In [5]:
def stem(string):
    """
    Apply NLTK's Porter stemmer to every whitespace-separated word.

    Parameters
    ----------
    string : str
        Text whose words should be stemmed.

    Returns
    -------
    str
        The stemmed words joined by single spaces.
    """
    stemmer = nltk.porter.PorterStemmer()
    return ' '.join(map(stemmer.stem, string.split()))

In [6]:
def lemma(string):
    """
    Apply NLTK's WordNet lemmatizer to every whitespace-separated word.

    Parameters
    ----------
    string : str
        Text whose words should be lemmatized.

    Returns
    -------
    str
        The lemmatized words joined by single spaces.
    """
    lemmatizer = nltk.stem.WordNetLemmatizer()
    return ' '.join(lemmatizer.lemmatize(token) for token in string.split())

In [7]:
def remove_stopwords(string, extra_words=None, exclude_words=None):
    """
    Drop English stopwords from whitespace-separated text.

    The stopword set starts from NLTK's English list, is extended with
    ``extra_words``, and then has ``exclude_words`` taken out, so a word that
    appears in both arguments is kept in the text.

    Parameters
    ----------
    string : str
        Text to filter; it is split on whitespace.
    extra_words : iterable of str, optional
        Additional words to treat as stopwords.
    exclude_words : iterable of str, optional
        Words that must not be treated as stopwords. Words that are not in
        the stopword set are ignored instead of raising ``ValueError``.

    Returns
    -------
    str
        The remaining words joined by single spaces.
    """
    # A set gives constant-time membership checks and makes the exclusion
    # below remove a word entirely, even if it was added more than once.
    stopword_set = set(stopwords.words('english'))
    stopword_set.update(extra_words or ())
    stopword_set.difference_update(exclude_words or ())

    kept = [word for word in string.split() if word not in stopword_set]
    return ' '.join(kept)

In [8]:
def prepare_readme_data(df, column):
    """
    Add stemmed and lemmatized versions of a text column to a DataFrame.

    Each value of ``df[column]`` is passed, in order, through ``clean_html``,
    ``basic_clean``, ``tokenize`` and ``remove_stopwords``; the result is then
    stemmed and lemmatized separately.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the text column. It is modified in place: the columns
        ``'stemmed'`` and ``'lemmatized'`` are added (or overwritten).
    column : str
        Name of the column in ``df`` that holds the raw text.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with the two new columns.
    """
    # Missing values are treated as empty text; the cleaning functions use
    # string methods and regexes and would fail on None/NaN.
    cleaned = (df[column].fillna('')
                         .apply(clean_html)
                         .apply(basic_clean)
                         .apply(tokenize)
                         .apply(remove_stopwords)
              )

    df['stemmed'] = cleaned.apply(stem)
    df['lemmatized'] = cleaned.apply(lemma)
    return df

In [9]:
def wrangle_data(path='data.json', column='readme_contents'):
    """
    Load a JSON file and run ``prepare_readme_data`` on it.

    Parameters
    ----------
    path : str, optional
        Location of the JSON file passed to ``pandas.read_json``. Defaults
        to ``'data.json'``.
    column : str, optional
        Name of the text column to prepare. Defaults to
        ``'readme_contents'``.

    Returns
    -------
    pandas.DataFrame
        Whatever ``prepare_readme_data`` returns for the loaded frame.
    """
    raw = pd.read_json(path)
    return prepare_readme_data(raw, column)

In [10]:
# Load data.json into a DataFrame; the bare name on the last line makes the
# notebook render the frame as this cell's output.
data = pd.read_json('data.json')
data

Unnamed: 0,repo,language,readme_contents
0,rdpeng/ExData_Plotting1,,## Introduction\n\nThis assignment uses data f...
1,rdpeng/RepData_PeerAssessment1,,## Introduction\n\nIt is now possible to colle...
2,DataScienceSpecialization/courses,HTML,\n### Data Science Specialization\n\nThese are...
3,fighting41love/funNLP,Python,"<center>\n <img style=""border-radius: 0.312..."
4,magento/magento2,PHP,"\n<p align=""center"">\n<a href=""https://www.cod..."
...,...,...,...
485,rprokap/pset-9,JavaScript,# pset-9\n CREDITS SEQUENCE ...
486,konzy/mass_clone,Shell,# mass_clone\nThis is a shell script that will...
487,biter777/countries,Go,# countries\r\n\r\nCountries - ISO 3166 (ISO31...
488,sayantann11/all-classification-templetes-for-ML,Python,# all-classification-templetes-for-ML\nClassif...
