# Pre-processing

In [1]:
from pprint import pprint
import re

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import unicodedata

import matplotlib.pyplot as plt
import seaborn as sns

import unicodedata
import nltk
from nltk.tokenize.toktok import ToktokTokenizer
from nltk.corpus import stopwords

import time
import acquire #basic_clean, lemmatize
# np.random.seed("123")

**get data**

In [None]:
python_df = acquire.get_data(1)
# time.sleep(30)
java_sc_df = acquire.get_data(2)

In [None]:
python_df.head(2)

In [None]:
java_sc_df.head(2)

In [None]:
def clean(string):
    """
    This function puts a string in lowercase, normalizes any unicode characters, removes anything that         
    isn't an alphanumeric symbol or single quote.
    """
    # Normalize unicode characters
    string = unicodedata.normalize('NFKD', string)\
    .encode('ascii', 'ignore')\
    .decode('utf-8', 'ignore')
    
    # Remove unwanted characters and put string in lowercase
    string = re.sub(r"[^\w0-9'\s]", '', string).lower()
            
    return string

In [None]:
# cleaned_row = []
# for i in df.readme_content.values:
#     cleaned_row.append(clean(i))
# df = df.assign(cleaned_content=cleaned_row)

In [None]:
# clean the content separated my language as on big string
python_words = clean(" ".join(python_df.readme_content)).split()
javaScript_words = clean(" ".join(java_sc_df.readme_content)).split()
# make a data frame of the words
words_py = pd.DataFrame(python_words, columns=["python_words"])
words_js = pd.DataFrame(javaScript_words, columns=["javascript_words"])

In [None]:
words_js

In [None]:
words_py

## Term Frequency (TF)

In [None]:
# the count of the number of occurances of each word.
py_word_df = pd.DataFrame(words_py.python_words.value_counts().index, columns=["word"]).assign(raw_count_py= words_py.python_words.value_counts().values)
js_word_df = pd.DataFrame(words_js.javascript_words.value_counts().index, columns=["word"]).assign(raw_count_js= words_js.javascript_words.value_counts().values)

# The number of times each word appears divided by the total number of words.
frequency_py = py_word_df.raw_count_py / len(words_py)
frequency_js = js_word_df.raw_count_js / len(words_js)

# The frequency of each word divided by the maximum frequency. 
augmented_frequency_py = frequency_py / frequency_py.max()
augmented_frequency_js = frequency_js / frequency_js.max()

# add to the dataframe
py_word_df["frequency_py"] = frequency_py
py_word_df["augmented_frequency_py"] = augmented_frequency_py
js_word_df["frequency_js"] = frequency_js
js_word_df["augmented_frequency_js"] = augmented_frequency_js

In [None]:
py_word_df.head()

In [None]:
js_word_df.head()

## Inverse Document Frequency (IDF)

In [None]:
python_df.head(2), java_sc_df.head(2)

In [None]:
def prep_data(text:str, more_stopwords=[]):
    'A simple function to cleanup text data'
    wnl = nltk.stem.WordNetLemmatizer() # lemmitizer object
    # add more stop words to the original dictionary stop words from the english language
    stopwords = nltk.corpus.stopwords.words('english') + more_stopwords
    # normalize the string
    text = (unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('utf-8', 'ignore').lower())
    # remove not letters of numbers
    words = re.sub(r'[^\w\s]', '', text).split()
    # lemmatize the string
    lemma =  [wnl.lemmatize(word) for word in words if word not in stopwords]
    return lemma

In [None]:

# our 3 example documents
documents = {
    'python': " ".join(python_df.readme_content),
    'javaScript': " ".join(java_sc_df.readme_content),
}

for doc, topic in documents.items():
    # clean and lemmatize the data and join them back by space
    documents[doc] = " ".join(prep_data(topic))
    print('Cleaning and lemmatizing...')

In [None]:
# A simple way to calculate idf for demonstration. Note that this
# function relies on the globally defined documents variable.
def idf(word):            
    n_occurences = sum([1 for doc in documents.values() if word in doc])
    return len(documents) / n_occurences + 1

# Get a list of the unique words
unique_words = pd.Series(' '.join(documents.values()).split()).unique()

# put the unique words into a data frame
(pd.DataFrame(dict(word=unique_words))
 # calculate the idf for each word
 .assign(idf=lambda df: df.word.apply(idf))
 # sort the data for presentation purposes
 .set_index('word')
 .sort_values(by='idf', ascending=False)
 .head(5))

In [None]:
def prep_readmes(df, col="readme_content"):
    """
    Takes in the dataframe and the column name that contains the corpus data, creates a column of cleaned data, then uses that 
    to create a column without stopwords that is lemmatized, performs a train-validate-test split, performs an x-y split, and
    returns x and y train, x and y validate, and x and y test.
    """
    # Create the cleaned column

    cleaned_row = []
    for i in df.readme_content.values:
        cleaned_row.append(clean(i))
    df = df.assign(cleaned_content=cleaned_row)
#     df['cleaned'] = df[col].apply(lambda x: clean(x))
    df['lemmatized'] = df['cleaned_content'].apply(lambda x: lemmatize(remove_stopwords(x)))
    
    # Split the dataframe (70/15/15)
    train, validate, test = split_readmes(df)
    
#     # perform x-y split
#     x_train, y_train = train.drop(columns=('language')), train.language
#     x_validate, y_validate = validate.drop(columns=('language')), validate.language
#     x_test, y_test = test.drop(columns=('language')), test.language
    
    return train, validate, test

In [None]:
acquire_readmes()

In [None]:
import wrangle

In [None]:
train, validate, test = wrangle.prep_readmes(python_df)

In [None]:
train

In [2]:
python_df = acquire.get_data(1)
# time.sleep(30)
java_sc_df = acquire.get_data(2)

In [5]:
df = pd.concat([python_df, java_sc_df],axis=0)
df

Unnamed: 0.1,Unnamed: 0,repo_name,url,language,readme_content
0,0,jackfrued/Python-100-Days,https://github.com/jackfrued/Python-100-Days/b...,Python,Python - 100天从新手到大师作者：骆昊说明：从项目上线到获得8w+星标以来，一直收...
1,1,donnemartin/system-design-primer,https://github.com/donnemartin/system-design-p...,Python,English ∙ 日本語 ∙ 简体中文 ∙ 繁體中文 | العَرَبِيَّة‎ ∙ ...
2,2,Significant-Gravitas/Auto-GPT,https://github.com/Significant-Gravitas/Auto-G...,Python,Auto-GPT: An Autonomous GPT-4 Experiment💡 Get ...
3,3,AUTOMATIC1111/stable-diffusion-webui,https://github.com/AUTOMATIC1111/stable-diffus...,Python,Stable Diffusion web UIA browser interface bas...
4,4,yandex-praktikum/backend_test_homework,https://github.com/yandex-praktikum/backend_te...,Python,backend_test_homework
...,...,...,...,...,...
83,83,learn-co-students/javascript-strings-lab-js-ap...,https://github.com/learn-co-students/javascrip...,JavaScript,"JavaScript Strings LabOverviewIn this lab, we'..."
84,84,openlayers/openlayers,https://github.com/openlayers/openlayers/blob/...,JavaScript,"OpenLayersOpenLayers is a high-performance, fe..."
85,85,learn-co-curriculum/phase-1-destructuring-assi...,https://github.com/learn-co-curriculum/phase-1...,JavaScript,Destructuring AssignmentLearning GoalsUse dest...
86,86,airbnb/lottie-web,https://github.com/airbnb/lottie-web/blob/mast...,JavaScript,"Lottie for Web, Android, iOS, React Native, an..."


In [6]:
df.to_csv("full_py_and_js_data.csv", mode="w")

In [2]:
import wrangle

In [3]:
wrangle.acquire_readmes()

returning python and Java-script data


Unnamed: 0.2,Unnamed: 0,repo_name,url,language,readme_content,Unnamed: 0.1,repo_name.1,url.1,language.1,readme_content.1
0,0,jackfrued/Python-100-Days,https://github.com/jackfrued/Python-100-Days/b...,Python,Python - 100天从新手到大师作者：骆昊说明：从项目上线到获得8w+星标以来，一直收...,0.0,nightscout/cgm-remote-monitor,https://github.com/nightscout/cgm-remote-monit...,JavaScript,Nightscout Web Monitor (a.k.a. cgm-remote-moni...
1,1,donnemartin/system-design-primer,https://github.com/donnemartin/system-design-p...,Python,English ∙ 日本語 ∙ 简体中文 ∙ 繁體中文 | العَرَبِيَّة‎ ∙ ...,1.0,trekhleb/javascript-algorithms,https://github.com/trekhleb/javascript-algorit...,JavaScript,جافا سكريبت خوارزميات وهياكل البياناتتحتوي هذ...
2,2,Significant-Gravitas/Auto-GPT,https://github.com/Significant-Gravitas/Auto-G...,Python,Auto-GPT: An Autonomous GPT-4 Experiment💡 Get ...,2.0,angular/angular.js,https://github.com/angular/angular.js/blob/mas...,JavaScript,Using AngularJS with the Closure CompilerThe C...
3,3,AUTOMATIC1111/stable-diffusion-webui,https://github.com/AUTOMATIC1111/stable-diffus...,Python,Stable Diffusion web UIA browser interface bas...,3.0,TheOdinProject/javascript-exercises,https://github.com/TheOdinProject/javascript-e...,JavaScript,JavaScript ExercisesThese JavaScript exercises...
4,4,yandex-praktikum/backend_test_homework,https://github.com/yandex-praktikum/backend_te...,Python,backend_test_homework,4.0,jquery/jquery,https://github.com/jquery/jquery/blob/main/REA...,JavaScript,jQuery — New Wave JavaScriptMeetings are curre...
...,...,...,...,...,...,...,...,...,...,...
85,85,Ebazhanov/linkedin-skill-assessments-quizzes,https://github.com/Ebazhanov/linkedin-skill-as...,Python,Linkedin Skill assessments - Answers⚠️ DISCLAI...,85.0,learn-co-curriculum/phase-1-destructuring-assi...,https://github.com/learn-co-curriculum/phase-1...,JavaScript,Destructuring AssignmentLearning GoalsUse dest...
86,86,apachecn/ailearning,https://github.com/apachecn/ailearning/blob/ma...,Python,AI learning协议：...,86.0,airbnb/lottie-web,https://github.com/airbnb/lottie-web/blob/mast...,JavaScript,"Lottie for Web, Android, iOS, React Native, an..."
87,87,hankcs/HanLP,https://github.com/hankcs/HanLP/blob/doc-zh/RE...,Python,HanLP: Han Language Processing ...,87.0,bloominstituteoftechnology/node-db3-project,https://github.com/bloominstituteoftechnology/...,JavaScript,Node DB3 Project Starter CodeTask 1: Project S...
88,88,langchain-ai/langchain,https://github.com/langchain-ai/langchain/blob...,Python,🦜️🔗 LangChain⚡ Building applications with LLMs...,,,,,
