# Hypothesis testing

In [22]:
from pprint import pprint
import re

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import unicodedata

import matplotlib.pyplot as plt
import seaborn as sns

import unicodedata
import nltk
from nltk.tokenize.toktok import ToktokTokenizer
from nltk.corpus import stopwords

import time
import acquire #basic_clean, lemmatize
# np.random.seed("123")

**get data**

In [23]:
# Load the two README datasets through the project-local `acquire` module.
# NOTE(review): the integer argument presumably selects the language
# (1 -> Python, 2 -> JavaScript) -- confirm against acquire.get_data.
python_df = acquire.get_data(1)
# time.sleep(30)
java_sc_df = acquire.get_data(2)

In [24]:
python_df.head(2)

Unnamed: 0.1,Unnamed: 0,repo_name,url,language,readme_content
0,0,jackfrued/Python-100-Days,https://github.com/jackfrued/Python-100-Days/b...,Python,Python - 100天从新手到大师作者：骆昊说明：从项目上线到获得8w+星标以来，一直收...
1,1,donnemartin/system-design-primer,https://github.com/donnemartin/system-design-p...,Python,English ∙ 日本語 ∙ 简体中文 ∙ 繁體中文 | العَرَبِيَّة‎ ∙ ...


In [25]:
java_sc_df.head(2)

Unnamed: 0.1,Unnamed: 0,repo_name,url,language,readme_content
0,0,nightscout/cgm-remote-monitor,https://github.com/nightscout/cgm-remote-monit...,JavaScript,Nightscout Web Monitor (a.k.a. cgm-remote-moni...
1,1,trekhleb/javascript-algorithms,https://github.com/trekhleb/javascript-algorit...,JavaScript,جافا سكريبت خوارزميات وهياكل البياناتتحتوي هذ...


In [8]:
def clean(string):
    """
    Fold a string to ASCII, strip punctuation, and lowercase it.

    The text is NFKD-normalized and every character without an ASCII form is
    dropped. After that, any character that is not a word character (letter,
    digit or underscore), a single quote, or whitespace is removed, and the
    result is returned in lowercase.
    """
    # Decompose accented characters, then discard whatever is not ASCII
    ascii_bytes = unicodedata.normalize('NFKD', string).encode('ascii', 'ignore')
    ascii_text = ascii_bytes.decode('utf-8', 'ignore')

    # Drop punctuation and symbols; whitespace is kept so words stay separated
    stripped = re.sub(r"[^\w0-9'\s]", '', ascii_text)

    return stripped.lower()

In [9]:
# cleaned_row = []
# for i in df.readme_content.values:
#     cleaned_row.append(clean(i))
# df = df.assign(cleaned_content=cleaned_row)

In [10]:
# Per language: join every README into one big string, clean it, and split it into words
python_words = clean(" ".join(python_df.readme_content)).split()
javaScript_words = clean(" ".join(java_sc_df.readme_content)).split()
# Wrap each word list in a one-column data frame
words_py = pd.DataFrame({"python_words": python_words})
words_js = pd.DataFrame({"javascript_words": javaScript_words})

In [11]:
words_js

Unnamed: 0,javascript_words
0,nightscout
1,web
2,monitor
3,aka
4,cgmremotemonitor
...,...
115186,most
115187,cheese
115188,measured
115189,in


In [12]:
words_py

Unnamed: 0,python_words
0,python
1,1008w15pythoncore50coursespythonbpythonuppytho...
2,python
3,java
4,go
...,...
100813,document
100814,20230815
100815,550
100816,510chatgpt4xx


## Term Frequency (TF)

In [13]:
def build_term_frequency(words, suffix):
    """
    Build a term-frequency table for one corpus.

    Parameters
    ----------
    words : pd.Series
        One entry per word occurrence in the corpus.
    suffix : str
        Tag appended to the generated column names (e.g. "py" or "js").

    Returns
    -------
    pd.DataFrame
        One row per distinct word, most frequent first, with columns
        ``word``, ``raw_count_<suffix>``, ``frequency_<suffix>`` and
        ``augmented_frequency_<suffix>``.
    """
    count_col = f"raw_count_{suffix}"
    # Count the occurrences of each word once and reuse the result.
    table = words.value_counts().rename_axis("word").reset_index(name=count_col)
    # Frequency: occurrences of the word divided by the total number of words.
    frequency = table[count_col] / len(words)
    table[f"frequency_{suffix}"] = frequency
    # Augmented frequency: frequency divided by the maximum frequency.
    table[f"augmented_frequency_{suffix}"] = frequency / frequency.max()
    return table


py_word_df = build_term_frequency(words_py.python_words, "py")
js_word_df = build_term_frequency(words_js.javascript_words, "js")

# Keep the standalone Series names available for any cell that still refers to them.
frequency_py = py_word_df["frequency_py"]
frequency_js = js_word_df["frequency_js"]
augmented_frequency_py = py_word_df["augmented_frequency_py"]
augmented_frequency_js = js_word_df["augmented_frequency_js"]

In [14]:
py_word_df.head()

Unnamed: 0,word,raw_count_py,frequency_py,augmented_frequency_py
0,and,2921,0.028973,1.0
1,the,2428,0.024083,0.831222
2,to,2074,0.020572,0.710031
3,for,1950,0.019342,0.66758
4,a,1729,0.01715,0.591921


In [15]:
js_word_df.head()

Unnamed: 0,word,raw_count_js,frequency_js,augmented_frequency_js
0,the,6424,0.055768,1.0
1,to,4086,0.035472,0.636052
2,a,2706,0.023491,0.421233
3,is,2284,0.019828,0.355542
4,of,2081,0.018066,0.323941


## Inverse Document Frequency (IDF)

In [16]:
python_df.head(2), java_sc_df.head(2)

(   Unnamed: 0                         repo_name  \
 0           0         jackfrued/Python-100-Days   
 1           1  donnemartin/system-design-primer   
 
                                                  url language  \
 0  https://github.com/jackfrued/Python-100-Days/b...   Python   
 1  https://github.com/donnemartin/system-design-p...   Python   
 
                                       readme_content  
 0  Python - 100天从新手到大师作者：骆昊说明：从项目上线到获得8w+星标以来，一直收...  
 1  English ∙ 日本語 ∙ 简体中文 ∙ 繁體中文 | العَرَبِيَّة‎ ∙ ...  ,
    Unnamed: 0                       repo_name  \
 0           0   nightscout/cgm-remote-monitor   
 1           1  trekhleb/javascript-algorithms   
 
                                                  url    language  \
 0  https://github.com/nightscout/cgm-remote-monit...  JavaScript   
 1  https://github.com/trekhleb/javascript-algorit...  JavaScript   
 
                                       readme_content  
 0  Nightscout Web Monitor (a.k.a. cgm-remote-moni...  


In [17]:
def prep_data(text: str, more_stopwords=None):
    """
    Normalize, tokenize, filter and lemmatize a piece of text.

    Parameters
    ----------
    text : str
        Raw text to clean.
    more_stopwords : iterable of str, optional
        Extra words to drop in addition to NLTK's English stopwords.

    Returns
    -------
    list of str
        Lemmatized words, in their original order, with stopwords removed.
    """
    wnl = nltk.stem.WordNetLemmatizer()  # lemmatizer object
    # English stopwords plus any caller-supplied extras; a set gives constant-time
    # membership tests instead of scanning a list for every word.
    stop_set = set(nltk.corpus.stopwords.words('english'))
    if more_stopwords:
        stop_set.update(more_stopwords)
    # Fold to ASCII (dropping characters with no ASCII form) and lowercase
    text = (unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('utf-8', 'ignore').lower())
    # Remove everything that is not a word character or whitespace, then split into words
    words = re.sub(r'[^\w\s]', '', text).split()
    # Drop stopwords and lemmatize what remains
    lemma = [wnl.lemmatize(word) for word in words if word not in stop_set]
    return lemma

In [18]:

# One document per language: all of that language's READMEs joined into a single string
documents = dict(
    python=" ".join(python_df.readme_content),
    javaScript=" ".join(java_sc_df.readme_content),
)

for language in list(documents):
    # Clean and lemmatize the raw text, then join the words back with spaces
    documents[language] = " ".join(prep_data(documents[language]))
    print('Cleaning and lemmatizing...')

Cleaning and lemmatizing...
Cleaning and lemmatizing...


In [19]:
# A simple way to calculate idf for demonstration. Note that this
# function relies on the globally defined document_token_sets variable,
# which is a snapshot of `documents` taken when this cell runs.

# Tokenize each document once. Testing `word in <string>` would be a substring
# match (e.g. "java" would count as present in a document that only contains
# "javascript"), so membership is checked against whole tokens instead.
document_token_sets = {name: set(text.split()) for name, text in documents.items()}


def idf(word):
    """
    Return the demonstration idf score for ``word``.

    The score is (number of documents / number of documents containing the
    word as a whole token) + 1. Raises ZeroDivisionError if the word appears
    in no document.

    NOTE(review): operator precedence makes this (N / n) + 1, not N / (n + 1),
    and no logarithm is applied. The formula is kept as-is to preserve the
    existing scores -- confirm the intended definition.
    """
    n_occurences = sum(word in tokens for tokens in document_token_sets.values())
    return len(document_token_sets) / n_occurences + 1


# Get a list of the unique words
unique_words = pd.Series(' '.join(documents.values()).split()).unique()

# put the unique words into a data frame
(pd.DataFrame(dict(word=unique_words))
 # calculate the idf for each word
 .assign(idf=lambda df: df.word.apply(idf))
 # sort the data for presentation purposes
 .set_index('word')
 .sort_values(by='idf', ascending=False)
 .head(5))

Unnamed: 0_level_0,idf
word,Unnamed: 1_level_1
platen,3.0
italso,3.0
susansee,3.0
consolelogexpected,3.0
solutionif,3.0
