In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from acquire import *
from prepare_nlp_josh import *
import env
import json
from requests import get
from json.decoder import JSONDecodeError
from bs4 import BeautifulSoup
import time
from tqdm import tqdm

import nltk
from nltk.corpus import stopwords
from nltk.tokenize.toktok import ToktokTokenizer
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem import PorterStemmer, SnowballStemmer
#The Porter stemmer is also available as nltk.porter.PorterStemmer
from nltk.stem.snowball import EnglishStemmer


from nltk.stem import WordNetLemmatizer
from nltk.book import *
from nltk.text import Text


import env

#These downloads may need to be run once for the NLTK library:

#nltk.download('omw-1.4')
#nltk.download("punkt")
#nltk.download("stopwords")
#nltk.download('averaged_perceptron_tagger')
#nltk.download('tagsets')
#nltk.download("maxent_ne_chunker")
#nltk.download("words")
#nltk.download("book") #big download

*** Introductory Examples for the NLTK Book ***
Loading text1, ..., text9 and sent1, ..., sent9
Type the name of the text or sentence to view it.
Type: 'texts()' or 'sents()' to list the materials.
text1: Moby Dick by Herman Melville 1851
text2: Sense and Sensibility by Jane Austen 1811
text3: The Book of Genesis
text4: Inaugural Address Corpus
text5: Chat Corpus
text6: Monty Python and the Holy Grail
text7: Wall Street Journal
text8: Personals Corpus
text9: The Man Who Was Thursday by G . K . Chesterton 1908


In [2]:
# Load the table of repository links through the project helper get_links().
# NOTE(review): get_links arrives via a star import (presumably from acquire) — confirm its origin.
links = get_links()

In [3]:
#When you get new links, delete this line.
#links.href = links.href.str[1:]

In [4]:
# Preview the first rows of the links table.
links.head()

Unnamed: 0,href
0,/learn-co-students/simple-blackjack-cli-prework
1,/rocketacademy/basics-blackjack
2,/datamllab/rlcard
3,/learn-co-students/simple-blackjack-cli-001-pr...
4,/cocos-creator/tutorial-blackjack-deprecated


In [5]:
# Run this to get new data: builds the README dataset from the repo paths in links.href.
# NOTE(review): presumably get_repos requests every repository, so this cell may be slow
# and network-dependent — confirm, and consider caching the result to disk.
df = get_repos(links.href)

In [6]:
#Run to save the data
#df.to_csv("origional_data.csv", index=False)

In [7]:
# Drop every row that has a missing value in any column.
# This mutates df in place, so the unfiltered frame is no longer available after this cell.
df.dropna(inplace=True)

In [8]:
# Preview the first rows of the dataset after dropping incomplete rows.
df.head()

Unnamed: 0,repo,language,readme_contents
0,/learn-co-students/simple-blackjack-cli-prework,Ruby,# Simplified Blackjack - Procedural\n\n## Lear...
1,/rocketacademy/basics-blackjack,HTML,# Rocket Academy Coding Basics: Blackjack\n
2,/datamllab/rlcard,Python,# RLCard: A Toolkit for Reinforcement Learning...
3,/learn-co-students/simple-blackjack-cli-001-pr...,Ruby,# Simplified Blackjack - Procedural\n\n## Lear...
4,/cocos-creator/tutorial-blackjack-deprecated,JavaScript,# 21点游戏 - Cocos Creator 制造\n\n「21点游戏」是 Cocos C...


In [9]:
# Run the project helper basic_clean over every raw README and keep the result
# in a new column, leaving readme_contents untouched.
# NOTE(review): the exact normalisation rules live in basic_clean — confirm there.
df['readme_clean'] = df['readme_contents'].apply(basic_clean)

In [10]:
# Inspect the cleaned text for the first few READMEs.
df['readme_clean'].head()

0     simplified blackjack  procedural\n\n learning...
1             rocket academy coding basics blackjack\n
2     rlcard a toolkit for reinforcement learning i...
3     simplified blackjack  procedural\n\n learning...
4     21  cocos creator \n\n21 cocos creator  demo ...
Name: readme_clean, dtype: object

### Tokenize words

In [11]:
# Tokenize the cleaned text. The result overwrites readme_clean, so the
# pre-tokenization text is not kept and re-running this cell tokenizes already-tokenized text.
# tokenize_tool=2 is forwarded to the project helper `tokenized` — TODO confirm which tokenizer it selects.
df['readme_clean'] = df['readme_clean'].apply(tokenized, tokenize_tool=2)

In [12]:
# Inspect the tokenized text for the first few READMEs.
df['readme_clean'].head()

0    simplified blackjack procedural learning goals...
1               rocket academy coding basics blackjack
2    rlcard a toolkit for reinforcement learning in...
3    simplified blackjack procedural learning goals...
4    21 cocos creator 21 cocos creator demo cocos c...
Name: readme_clean, dtype: object

### Lemmatize words

In [13]:
# Apply the project helper `lemmatized` to the tokenized text and store it in its own column.
df['readme_lemm'] = df['readme_clean'].apply(lemmatized)

### Stem words

- Understemming: Two related words should be reduced to the same stem but aren't. (False negative)


- Overstemming: Two unrelated words are reduced to the same stem even though they shouldn't be. (False positive)

- Consider Snowball stemmer aka Porter2

In [14]:
# Apply the project helper stemmerize_tool to the tokenized text and store it in its own column.
# stemmer_type=3 is forwarded to the helper — TODO confirm which stemmer that value selects.
df['readme_stem'] = df['readme_clean'].apply(stemmerize_tool, stemmer_type=3)

In [15]:
# Inspect the stemmed text for the first few READMEs.
df['readme_stem'].head()

0    simplifi blackjack procedur learn goal util co...
1                  rocket academi code basic blackjack
2    rlcard a toolkit for reinforc learn in card ga...
3    simplifi blackjack procedur learn goal util co...
4    21 coco creator 21 coco creator demo coco crea...
Name: readme_stem, dtype: object

### Remove stopwords

In [16]:
# Stopword-filtered copy of the stemmed text (the "swords" in the column name stands for stopwords).
# NOTE(review): the stopword list is whatever remove_stopwords uses by default — confirm.
df['readme_stem_no_swords'] = df['readme_stem'].apply(remove_stopwords)

In [17]:
# Stopword-filtered copy of the lemmatized text, produced by the same helper as the stemmed variant.
df['readme_lemm_no_swords'] = df['readme_lemm'].apply(remove_stopwords)

### Split data

In [18]:
# Split the prepared frame into three parts using the project helper train_validate.
# NOTE(review): no seed or stratification argument is passed here; confirm that
# train_validate fixes its own random state so the split is reproducible.
train, val, test = train_validate(df)

In [19]:
# Count how many READMEs each language has in the training split.
train.language.value_counts()

Ruby                129
Java                 27
JavaScript           19
Python               18
C++                   5
C#                    3
Swift                 3
Objective-C           3
Lua                   2
CSS                   2
Kotlin                2
TypeScript            2
HTML                  1
Jupyter Notebook      1
PHP                   1
Go                    1
C                     1
Dart                  1
Vue                   1
Name: language, dtype: int64

### Explore Stems

In [20]:
# Flatten the per-README n-grams (ngrams_creator with its default size) built
# from the stopword-free stemmed text into one flat list for the training split.
big_rams_stem = [
    gram
    for readme_grams in train['readme_stem_no_swords'].apply(ngrams_creator)
    for gram in readme_grams
]

In [21]:
# Wrap the flat n-gram list in a Series so pandas counting methods can be used on it.
bi_stem_series = pd.Series(big_rams_stem)

In [22]:
# Flatten the per-README n-grams of size 3 built from the stopword-free
# stemmed text into one flat list for the training split.
trig_rams_stem = [
    gram
    for readme_grams in train['readme_stem_no_swords'].apply(ngrams_creator, n_grams=3)
    for gram in readme_grams
]

In [23]:
# Wrap the flat size-3 n-gram list in a Series so pandas counting methods can be used on it.
tri_stem_series = pd.Series(trig_rams_stem)

In [24]:
# Most frequent n-grams in the stemmed training text (value_counts sorts by descending count).
bi_stem_series.value_counts().head()

(command, line)     1927
(card, total)       1375
(user, input)       1149
(method, method)    1126
(runner, method)    1125
dtype: int64

In [25]:
# Most frequent size-3 n-grams in the stemmed training text.
tri_stem_series.value_counts().head()

(command, line, app)         798
(method, take, argument)     500
(deal, new, card)            376
(command, line, interfac)    376
(h, hit, stay)               375
dtype: int64

### Explore Lemmas

In [26]:
# Flatten the per-README n-grams (ngrams_creator with its default size) built
# from the stopword-free lemmatized text into one flat list for the training split.
big_rams_lemm = [
    gram
    for readme_grams in train['readme_lemm_no_swords'].apply(ngrams_creator)
    for gram in readme_grams
]

In [27]:
# Wrap the flat n-gram list in a Series so pandas counting methods can be used on it.
bi_lemm_series = pd.Series(big_rams_lemm)

In [28]:
# Flatten the per-README n-grams of size 3 built from the stopword-free
# lemmatized text into one flat list for the training split.
trig_rams_lemm = [
    gram
    for readme_grams in train['readme_lemm_no_swords'].apply(ngrams_creator, n_grams=3)
    for gram in readme_grams
]

In [29]:
# Wrap the flat size-3 n-gram list in a Series so pandas counting methods can be used on it.
tri_lemm_series = pd.Series(trig_rams_lemm)

In [30]:
# Most frequent n-grams in the lemmatized training text (value_counts sorts by descending count).
bi_lemm_series.value_counts().head()

(command, line)     1927
(card, total)       1375
(user, input)       1149
(method, method)    1126
(runner, method)    1125
dtype: int64

In [31]:
# Most frequent size-3 n-grams in the lemmatized training text.
tri_lemm_series.value_counts().head()

(command, line, app)          673
(method, take, argument)      500
(deal, new, card)             376
(command, line, interface)    376
(runner, method, runner)      375
dtype: int64