In [1]:
import numpy as np 
import pandas as pd 

import matplotlib.pyplot as plt
import seaborn as sns

import json
import requests

import wrangle
import prepare as prep

from env import github_token, github_username

from wordcloud import WordCloud

from sklearn.model_selection import train_test_split

import re
import unicodedata
import nltk

## Acquire and Prep

In [2]:
df = pd.read_json("data.json")
df.head()

Unnamed: 0,repo,language,readme_contents
0,mcastrolab/Brazil-Covid19-e0-change,R,# Reduction in life expectancy in Brazil after...
1,jschoeley/de0anim,R,# Animated annual changes in life-expectancy\n...
2,sychi77/Thoracic_Surgery_Patient_Survival,Jupyter Notebook,# Thoracic Surgery for Lung Cancer Data Set\n ...
3,ashtad63/HackerRank-Data-Scientist-Hiring-Test,Jupyter Notebook,# HackerRank Data Scientist Hiring Test: Predi...
4,OxfordDemSci/ex2020,R,"<p align=""center"">\n <img src=""https://github..."


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 166 entries, 0 to 165
Data columns (total 3 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   repo             166 non-null    object
 1   language         157 non-null    object
 2   readme_contents  166 non-null    object
dtypes: object(3)
memory usage: 4.0+ KB


In [4]:
df_copy = df.copy()

In [None]:
df = prep.prep_data(df)
df.head()

In [None]:
df.language.value_counts()

In [None]:
df_all_languages = prep.prep_data(df_copy, keep_top_languages=False)

In [None]:
df_all_languages.language.value_counts()

## Split Data

In [None]:
def split_data(df, test_size=.2, validate_size=.3, random_state=123):
    """
    Split a dataframe into train, validate, and test sets.

    The test set is carved off first; what remains is then split again into
    train and validate. With the default sizes this works out to roughly
    56% train, 24% validate, and 20% test of the original rows.

    Parameters
    ----------
    df : DataFrame to split.
    test_size : share of `df` held out as the test set.
    validate_size : share of the non-test rows held out as the validate set.
    random_state : seed passed to both splits so the result is reproducible.

    Returns
    -------
    (train, validate, test) tuple of dataframes.
    """
    train_validate, test = train_test_split(
        df, test_size=test_size, random_state=random_state
    )
    train, validate = train_test_split(
        train_validate, test_size=validate_size, random_state=random_state
    )
    return train, validate, test

In [None]:
train, validate, test = split_data(df)
train.shape, validate.shape, test.shape

In [None]:
train_languages, validate_languages, test_languages = split_data(df_all_languages)
train_languages.shape

## Explore 
#### Pre-Processing

In [None]:
def show_counts_and_ratios(df, column):
    """
    Summarize how often each value of a column occurs.

    Takes in a dataframe and the name of a single column.
    Returns a dataframe indexed by the column's distinct values with:
      - 'n': the absolute count of each value
      - 'percent': the share of each value as a fraction between 0 and 1
        (not multiplied by 100)
    """
    counts = df[column].value_counts()
    ratios = df[column].value_counts(normalize=True)

    labels = pd.concat([counts, ratios], axis=1)
    labels.columns = ['n', 'percent']
    return labels

show_counts_and_ratios(train, "language")

In [None]:
def clean(text):
    """
    A simple function to cleanup text data.

    Steps, in order:
      1. Unicode-normalize (NFKD), drop anything that is not ASCII, lowercase.
      2. Remove every character that is neither a word character nor whitespace.
      3. Split on whitespace.
      4. Drop English stopwords and lemmatize the remaining words.

    Takes in a single string and returns a list of lemmatized words.
    """
    wnl = nltk.stem.WordNetLemmatizer()
    # A set gives constant-time membership tests; the stopword collection is
    # consulted once for every word in the text.
    stopwords = set(nltk.corpus.stopwords.words('english'))
    text = (unicodedata.normalize('NFKD', text)
             .encode('ascii', 'ignore')
             .decode('utf-8', 'ignore')
             .lower())
    words = re.sub(r'[^\w\s]', '', text).split()
    return [wnl.lemmatize(word) for word in words if word not in stopwords]


In [None]:
## Create lists of words for each language category
other = clean(' '.join(train[train.language == 'other'].original))
python = clean(' '.join(train[train.language == 'Python'].original))
r = clean(' '.join(train[train.language == 'R'].original))
html = clean(' '.join(train[train.language == 'HTML'].original))
all_words = clean(' '.join(train.original))

In [None]:
## Transform lists into series
other_freq = pd.Series(other).value_counts()
python_freq = pd.Series(python).value_counts()
r_freq = pd.Series(r).value_counts()
html_freq = pd.Series(html).value_counts()
all_freq = pd.Series(all_words).value_counts()

other_freq.head()

In [None]:
# Create a word_counts data frame we can work with:
# one row per word, one column per language category (plus 'all').

# NOTE: set_axis is called without `inplace`; that keyword was removed in
# pandas 2.0 and set_axis returns a new frame by default.
word_counts = (pd.concat([all_freq, other_freq, python_freq, r_freq, html_freq], axis=1, sort=True)
                .set_axis(['all', 'other', 'python', 'r', 'html'], axis=1)
                .fillna(0)      # a word missing from a category has a count of 0
                .astype(int))

word_counts.head()

### Answer questions about word frequency:

**What are the most frequently occurring words?**

In [None]:
word_counts.sort_values(by='all', ascending=False).head(20)

**Are there any words that uniquely identify one of the coding languages?**

In [None]:
# A word uniquely identifies a language when it never appears in any of the
# other languages. For each language, keep only those exclusive words and show
# the 6 with the highest count in that language.
language_cols = ['other', 'python', 'r', 'html']

pd.concat([
    word_counts[(word_counts[[col for col in language_cols if col != lang]] == 0).all(axis=1)]
    .sort_values(by=lang)
    .tail(6)
    for lang in language_cols
])

In [None]:
# For the 20 most common words overall, plot the fraction of each word's
# occurrences that comes from each language category
share_columns = ['p_other', 'p_python', 'p_r', 'p_html']
word_totals = word_counts['all']

with_shares = word_counts.assign(
    p_other=word_counts.other / word_totals,
    p_python=word_counts.python / word_totals,
    p_r=word_counts.r / word_totals,
    p_html=word_counts.html / word_totals,
)
top_20_shares = with_shares.sort_values(by='all')[share_columns].tail(20)
top_20_shares.sort_values('p_other').plot.barh(stacked=True)

plt.title('Proportion of Language Word Frequency for the 20 most common words')


### Create and Visualize Bigrams

In [None]:
# Top 20 other
top_20_other_bigrams = (pd.Series(nltk.ngrams(other, 2))
                      .value_counts()
                      .head(20))

top_20_other_bigrams.head()

In [None]:
# Top 20 python 
top_20_python_bigrams = (pd.Series(nltk.ngrams(python, 2))
                      .value_counts()
                      .head(20))

top_20_python_bigrams.head()

In [None]:
# Top 20 r
top_20_r_bigrams = (pd.Series(nltk.ngrams(r, 2))
                      .value_counts()
                      .head(20))

top_20_r_bigrams.head()

In [None]:
# Top 20 html
top_20_html_bigrams = (pd.Series(nltk.ngrams(html, 2))
                      .value_counts()
                      .head(20))

top_20_html_bigrams.head()

In [None]:
## Plot Top 20 Other

# Sort once and reuse the sorted series for both the bars and the tick labels,
# so that every bar is labelled with its own bigram.
other_bigrams_sorted = top_20_other_bigrams.sort_values()
other_bigrams_sorted.plot.barh(color='pink', width=.9, figsize=(10, 6))

plt.title('20 Most frequently occuring other bigrams')
plt.ylabel('Bigram')
plt.xlabel('# Occurances')

# make the labels pretty: "word1 word2" instead of the tuple repr
ticks, _ = plt.yticks()
labels = [' '.join(bigram) for bigram in other_bigrams_sorted.index]
_ = plt.yticks(ticks, labels)


In [None]:
## Create a wordcloud 
data = {k[0] + ' ' + k[1]: v for k, v in top_20_other_bigrams.to_dict().items()}
img = WordCloud(background_color='white', width=800, height=400).generate_from_frequencies(data)
plt.figure(figsize=(8, 4))
plt.imshow(img)
plt.axis('off')
plt.show()

In [None]:
## Plot Top 20 python

# Sort once and reuse the sorted series for both the bars and the tick labels,
# so that every bar is labelled with its own bigram.
python_bigrams_sorted = top_20_python_bigrams.sort_values()
python_bigrams_sorted.plot.barh(color='pink', width=.9, figsize=(10, 6))

# The data plotted here is the Python category, so the title says "python".
plt.title('20 Most frequently occuring python bigrams')
plt.ylabel('Bigram')
plt.xlabel('# Occurances')

# make the labels pretty: "word1 word2" instead of the tuple repr
ticks, _ = plt.yticks()
labels = [' '.join(bigram) for bigram in python_bigrams_sorted.index]
_ = plt.yticks(ticks, labels)


In [None]:
## Create a wordcloud 
data = {k[0] + ' ' + k[1]: v for k, v in top_20_python_bigrams.to_dict().items()}
img = WordCloud(background_color='white', width=800, height=400).generate_from_frequencies(data)
plt.figure(figsize=(8, 4))
plt.imshow(img)
plt.axis('off')
plt.show()

In [None]:
## Plot Top 20 r

# Sort once and reuse the sorted series for both the bars and the tick labels,
# so that every bar is labelled with its own bigram.
r_bigrams_sorted = top_20_r_bigrams.sort_values()
r_bigrams_sorted.plot.barh(color='pink', width=.9, figsize=(10, 6))

plt.title('20 Most frequently occuring r bigrams')
plt.ylabel('Bigram')
plt.xlabel('# Occurances')

# make the labels pretty: "word1 word2" instead of the tuple repr
ticks, _ = plt.yticks()
labels = [' '.join(bigram) for bigram in r_bigrams_sorted.index]
_ = plt.yticks(ticks, labels)

In [None]:
## Create a wordcloud 
data = {k[0] + ' ' + k[1]: v for k, v in top_20_r_bigrams.to_dict().items()}
img = WordCloud(background_color='white', width=800, height=400).generate_from_frequencies(data)
plt.figure(figsize=(8, 4))
plt.imshow(img)
plt.axis('off')
plt.show()

In [None]:
## Plot Top 20 html

# Sort once and reuse the sorted series for both the bars and the tick labels,
# so that every bar is labelled with its own bigram.
html_bigrams_sorted = top_20_html_bigrams.sort_values()
html_bigrams_sorted.plot.barh(color='pink', width=.9, figsize=(10, 6))

plt.title('20 Most frequently occuring html bigrams')
plt.ylabel('Bigram')
plt.xlabel('# Occurances')

# make the labels pretty: "word1 word2" instead of the tuple repr
ticks, _ = plt.yticks()
labels = [' '.join(bigram) for bigram in html_bigrams_sorted.index]
_ = plt.yticks(ticks, labels)

In [None]:
## Create a wordcloud 
data = {k[0] + ' ' + k[1]: v for k, v in top_20_html_bigrams.to_dict().items()}
img = WordCloud(background_color='white', width=800, height=400).generate_from_frequencies(data)
plt.figure(figsize=(8, 4))
plt.imshow(img)
plt.axis('off')
plt.show()

#### Add Other Features to Explore

In [None]:
#Create a function to get the character count of each doc
def get_char_count(string):
    """
    Return the length of `string`, i.e. how many characters it contains
    (whitespace and punctuation included).
    """
    char_total = len(string)
    return char_total

In [None]:
def get_word_count(string):
    """
    Return how many whitespace-separated words `string` contains.
    Repeated words are each counted.
    """
    # str.split() with no argument splits on any run of whitespace
    return len(string.split())

In [None]:
def get_unique_words(string):
    """
    Return how many distinct whitespace-separated words `string` contains.
    Comparison is exact, so words differing only in case count separately.
    """
    distinct_words = {word for word in string.split()}
    return len(distinct_words)

In [None]:
def get_sentence_count(string):
    """
    Return how many sentences `string` contains, as segmented by
    nltk.sent_tokenize.
    """
    return len(nltk.sent_tokenize(string))

In [None]:
# train comes out of train_test_split as a slice of the prepared frame;
# copy it before adding columns so the assignments are unambiguous.
train = train.copy()

# sentence_count must come from get_sentence_count (it was previously being
# filled with the unique-word count); the unique-word count gets its own column.
train['sentence_count'] = train.original.apply(get_sentence_count)
train['unique_word_count'] = train.original.apply(get_unique_words)

# The cells below read char_count and word_count. Create them here if they are
# not already present, so the notebook runs top to bottom on a fresh kernel.
# NOTE(review): confirm whether prep.prep_data already provides these columns.
for feature_name, feature_func in (('char_count', get_char_count),
                                   ('word_count', get_word_count)):
    if feature_name not in train.columns:
        train[feature_name] = train.original.apply(feature_func)

train.head()

In [None]:
train[train['char_count'] <=0]

In [None]:
sns.barplot(data=train, y='word_count', x='language')

In [None]:
sns.barplot(data=train, y='char_count', x='language')