In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

import json
from requests import get
from json.decoder import JSONDecodeError
from bs4 import BeautifulSoup
import time

import nltk
from nltk.corpus import stopwords
from nltk.tokenize.toktok import ToktokTokenizer
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem import PorterStemmer, SnowballStemmer
from nltk.stem.snowball import EnglishStemmer

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score, ConfusionMatrixDisplay
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.ensemble import RandomForestClassifier

from nltk.stem import WordNetLemmatizer
from nltk.book import *
from nltk.text import Text

from pprint import pprint

from acquire import *
from prepare_nlp_josh import *
from modeling import *
import env

#These download may need to be run for the NLTK library:

#nltk.download('omw-1.4')
#nltk.download("punkt")
#nltk.download("stopwords")
#nltk.download('averaged_perceptron_tagger')
#nltk.download('tagsets')
#nltk.download("maxent_ne_chunker")
#nltk.download("words")
#nltk.download("book") #big download

*** Introductory Examples for the NLTK Book ***
Loading text1, ..., text9 and sent1, ..., sent9
Type the name of the text or sentence to view it.
Type: 'texts()' or 'sents()' to list the materials.
text1: Moby Dick by Herman Melville 1851
text2: Sense and Sensibility by Jane Austen 1811
text3: The Book of Genesis
text4: Inaugural Address Corpus
text5: Chat Corpus
text6: Monty Python and the Holy Grail
text7: Wall Street Journal
text8: Personals Corpus
text9: The Man Who Was Thursday by G . K . Chesterton 1908


In [2]:
links = get_links()

In [3]:
#When you get new links, delete this line.
#links.href = links.href.str[1:]

In [4]:
links.head()

Unnamed: 0.1,Unnamed: 0,href
0,0,/learn-co-students/simple-blackjack-cli-prework
1,1,/rocketacademy/basics-blackjack
2,2,/datamllab/rlcard
3,3,/learn-co-students/simple-blackjack-cli-001-pr...
4,4,/cocos-creator/tutorial-blackjack-deprecated


In [5]:
#Run this to get new data
df = get_repos(links.href)

In [6]:
#Run to save the data
#df.to_csv("origional_data.csv", index=False)

In [7]:
df.shape

(440, 3)

In [8]:
df.dropna(inplace=True)

In [9]:
df.shape

(418, 3)

In [10]:
df.head()

Unnamed: 0,repo,language,readme_contents
0,/learn-co-students/simple-blackjack-cli-prework,Ruby,# Simplified Blackjack - Procedural\n\n## Lear...
1,/rocketacademy/basics-blackjack,HTML,# Rocket Academy Coding Basics: Blackjack\n
2,/datamllab/rlcard,Python,# RLCard: A Toolkit for Reinforcement Learning...
3,/learn-co-students/simple-blackjack-cli-001-pr...,Ruby,# Simplified Blackjack - Procedural\n\n## Lear...
4,/cocos-creator/tutorial-blackjack-deprecated,JavaScript,# 21点游戏 - Cocos Creator 制造\n\n「21点游戏」是 Cocos C...


In [11]:
df = clean_languages(df)

In [12]:
df['readme_clean'] = df['readme_contents'].apply(basic_clean)

In [13]:
df['readme_clean'].head()

0     simplified blackjack  procedural\n\n learning...
2     rlcard a toolkit for reinforcement learning i...
3     simplified blackjack  procedural\n\n learning...
4     21  cocos creator \n\n21 cocos creator  demo ...
5     simplified blackjack  procedural\n\n learning...
Name: readme_clean, dtype: object

### Tokenize words

In [14]:
df['readme_clean'] = df['readme_clean'].apply(tokenized, tokenize_tool=2)

In [15]:
df['readme_clean'].head()

0    simplified blackjack procedural learning goals...
2    rlcard a toolkit for reinforcement learning in...
3    simplified blackjack procedural learning goals...
4    21 cocos creator 21 cocos creator demo cocos c...
5    simplified blackjack procedural learning goals...
Name: readme_clean, dtype: object

### Lemm words

In [16]:
df['readme_lemm'] = df['readme_clean'].apply(lemmatized)

### Stem words

- Understemming: Two related words should be reduced to the same stem but aren't. (False negative)


- Overstemming: Two unrelated words are reduced to the same stem even though they shouldn't be. (False positive)

- Consider Snowball stemmer aka Porter2

In [17]:
df['readme_stem'] = df['readme_clean'].apply(stemmerize_tool, stemmer_type=3)

In [18]:
df['readme_stem'].head()

0    simplifi blackjack procedur learn goal util co...
2    rlcard a toolkit for reinforc learn in card ga...
3    simplifi blackjack procedur learn goal util co...
4    21 coco creator 21 coco creator demo coco crea...
5    simplifi blackjack procedur learn goal util co...
Name: readme_stem, dtype: object

### Remove stopwords

In [19]:
more_stopwords = ['stopswords', 'blackjack', 'java', 'cards', 'split', 'ace', 'variables', 'conditional', 'statements', 'loops', 'functions', 'object-oriented programming', 'syntax', 'comments', 'libraries', 'frameworks']

In [20]:
df['readme_stem_no_swords'] = df['readme_stem'].apply(remove_stopwords, extra_words=more_stopwords)

In [21]:
df['readme_lemm_no_swords'] = df['readme_lemm'].apply(remove_stopwords, extra_words=more_stopwords)

In [22]:
df = df[df['language'] != 'Other']

### Split data

In [23]:
train, val, test = train_validate(df)

In [24]:
len(train)

180

In [25]:
train.language.value_counts()

Ruby          98
Java          26
JavaScript    22
Python        21
C_based       13
Name: language, dtype: int64

### Explore Stems

In [26]:
big_rams_stem = []
for row in train['readme_stem_no_swords'].apply(ngrams_creator):
    big_rams_stem.extend(row)

In [27]:
bi_stem_series = pd.Series(big_rams_stem)

In [28]:
trig_rams_stem = []
for row in train['readme_stem_no_swords'].apply(ngrams_creator, n_grams=3):
    trig_rams_stem.extend(row)

In [29]:
tri_stem_series = pd.Series(trig_rams_stem)

In [30]:
bi_stem_series.value_counts().head()

(command, line)     1448
(card, total)       1036
(user, input)        867
(method, method)     847
(runner, method)     846
dtype: int64

In [31]:
tri_stem_series.value_counts().head()

(command, line, app)         594
(method, take, argument)     376
(deal, new, card)            285
(command, line, interfac)    283
(h, hit, stay)               282
dtype: int64

### Explore Lemms

In [32]:
big_rams_lemm = []
for row in train['readme_lemm_no_swords'].apply(ngrams_creator):
    big_rams_lemm.extend(row)

In [33]:
bi_lemm_series = pd.Series(big_rams_lemm)

In [34]:
trig_rams_lemm = []
for row in train['readme_lemm_no_swords'].apply(ngrams_creator, n_grams=3):
    trig_rams_lemm.extend(row)

In [35]:
tri_lemm_series = pd.Series(trig_rams_lemm)

In [36]:
bi_lemm_series.value_counts().head()

(command, line)     1448
(card, total)       1036
(user, input)        864
(method, method)     847
(runner, method)     846
dtype: int64

In [37]:
tri_lemm_series.value_counts().head()

(command, line, app)          500
(method, take, argument)      376
(deal, new, card)             285
(command, line, interface)    283
(runner, method, runner)      282
dtype: int64

###  Logistic Regression Modeling on Stem

#### Term Frequency

In [38]:
#Work on train['readme_stem_no_swords']

In [39]:
list_of_stem_words = []
#words = pd.Series(document.split())
for item in train['readme_stem_no_swords']:
    list_of_stem_words.extend(item.split(' '))

In [40]:
stem_series = pd.Series(list_of_stem_words)

In [41]:
stem_words_df = (pd.DataFrame({'raw_count': stem_series.value_counts()})
 .assign(frequency=lambda df: df.raw_count / df.raw_count.sum())
 .assign(augmented_frequency=lambda df: df.frequency / df.frequency.max()))

In [42]:
stem_words_df

Unnamed: 0,raw_count,frequency,augmented_frequency
method,8060,0.068937,1.000000
card,2863,0.024487,0.355211
test,2063,0.017645,0.255955
use,1841,0.015746,0.228412
line,1740,0.014882,0.215881
...,...,...,...
documentationhttpsreactjsorg,1,0.000009,0.000124
httpsfacebookgithubiocreatereactappdocscodesplittinghttpsfacebookgithubiocreatereactappdocscodesplit,1,0.000009,0.000124
makebat,1,0.000009,0.000124
httpsfacebookgithubiocreatereactappdocsanalyzingthebundlesizehttpsfacebookgithubiocreatereactappdocsanalyzingthebundles,1,0.000009,0.000124


#### Inverse Document Frequency (IDF)

In [43]:
"""
def idf(word, df):
    n_occurences = sum([1 for doc in df if word in doc])
    return len(df) / n_occurences

# Get a list of the unique words
unique_words = pd.Series(list_of_stem_words).unique()

# put the unique words into a data frame
(pd.DataFrame(dict(word=unique_words))
 # calculate the idf for each word
 .assign(idf=lambda df: df.word.apply(idf))
 # sort the data for presentation purposes
 .set_index('word')
 .sort_values(by='idf', ascending=False)
 .head(5))
 """

"\ndef idf(word, df):\n    n_occurences = sum([1 for doc in df if word in doc])\n    return len(df) / n_occurences\n\n# Get a list of the unique words\nunique_words = pd.Series(list_of_stem_words).unique()\n\n# put the unique words into a data frame\n(pd.DataFrame(dict(word=unique_words))\n # calculate the idf for each word\n .assign(idf=lambda df: df.word.apply(idf))\n # sort the data for presentation purposes\n .set_index('word')\n .sort_values(by='idf', ascending=False)\n .head(5))\n "

#### SKlearn Stem

In [44]:
tfidf_s = TfidfVectorizer()
tfidf_stem = tfidf_s.fit_transform(list_of_stem_words)
tfidf_stem

<116918x3026 sparse matrix of type '<class 'numpy.float64'>'
	with 114881 stored elements in Compressed Sparse Row format>

In [45]:
pd.DataFrame(tfidf_stem.todense(), columns=tfidf_s.get_feature_names())



Unnamed: 0,000,001,01,010,02,021101301,03,032,033,04,...,yield,yn,youd,youll,yourgithubusernam,youtub,youv,zerodepend,zip,zuza
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
116913,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
116914,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
116915,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
116916,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Modeling on Lemm

#### Term Frequency

In [46]:
#Work on train['readme_lemm_no_swords']

In [47]:
list_of_lemm_words = []
#words = pd.Series(document.split())
for item in train['readme_lemm_no_swords']:
    list_of_lemm_words.extend(item.split(' '))

In [48]:
lemm_series = pd.Series(list_of_lemm_words)

In [49]:
lemm_words_df = (pd.DataFrame({'raw_count': lemm_series.value_counts()})
 .assign(frequency=lambda df: df.raw_count / df.raw_count.sum())
 .assign(augmented_frequency=lambda df: df.frequency / df.frequency.max()))

In [50]:
lemm_words_df

Unnamed: 0,raw_count,frequency,augmented_frequency
method,8060,0.069485,1.000000
card,2863,0.024682,0.355211
test,1956,0.016863,0.242680
line,1740,0.015000,0.215881
put,1714,0.014776,0.212655
...,...,...,...
readded,1,0.000009,0.000124
47,1,0.000009,0.000124
45,1,0.000009,0.000124
diagramimagesblackjackactivitydiagrampng,1,0.000009,0.000124


#### Inverse Document Frequency (IDF)

#### SKlearn Lemm

In [51]:
tfidf_l = TfidfVectorizer()
tfidf_lemm = tfidf_l.fit_transform(list_of_lemm_words)
tfidf_lemm

<115997x3536 sparse matrix of type '<class 'numpy.float64'>'
	with 113763 stored elements in Compressed Sparse Row format>

In [52]:
pd.DataFrame(tfidf_lemm.todense(), columns=tfidf_l.get_feature_names())



Unnamed: 0,000,001,01,010,02,021101301,03,032,033,04,...,yn,youd,youll,youre,yourgithubusername,youtube,youve,zerodependencies,zip,zuza
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
115992,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
115993,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
115994,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
115995,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Modeling with Logistic Regression Classification

#### Stem

In [53]:
target_col = "language"

In [54]:
train_stem = train[['readme_stem_no_swords', 'language']].copy()
val_stem = val[['readme_stem_no_swords', 'language']].copy()
test_stem = test[['readme_stem_no_swords', 'language']].copy()

In [55]:
X_train0, y_train0, X_val0, y_val0, X_test0, y_test0 = train_val_test(train_stem, val_stem, test_stem, target_col)

In [56]:
X_train0.shape

(180, 1)

In [57]:
X_train0.head()

Unnamed: 0,readme_stem_no_swords
326,simplifi procedur learn goal util condit logic...
349,game intent record probabl use exercis c well ...
322,simplifi procedur learn goal util condit logic...
218,simplifi procedur object 1 util condit logic l...
346,popular card game play casino around world gam...


In [58]:
y_train0.head()

326          Ruby
349       C_based
322          Ruby
218          Ruby
346    JavaScript
Name: language, dtype: object

In [59]:
X_train0.readme_stem_no_swords

326    simplifi procedur learn goal util condit logic...
349    game intent record probabl use exercis c well ...
322    simplifi procedur learn goal util condit logic...
218    simplifi procedur object 1 util condit logic l...
346    popular card game play casino around world gam...
                             ...                        
230    simplifi procedur learn goal util condit logic...
29     licenc bsdhttpcreativecommonsorglicensesbsd ga...
334    simplifi procedur learn goal util condit logic...
280                                                     
182    simplifi procedur learn goal util condit logic...
Name: readme_stem_no_swords, Length: 180, dtype: object

In [60]:
tfidf_s = TfidfVectorizer()
X_train0 = tfidf_s.fit_transform(X_train0.readme_stem_no_swords)
X_val0 = tfidf_s.transform(X_val0.readme_stem_no_swords)
X_test0 = tfidf_s.transform(X_test0.readme_stem_no_swords)

In [61]:
X_train0

<180x3026 sparse matrix of type '<class 'numpy.float64'>'
	with 37916 stored elements in Compressed Sparse Row format>

In [62]:
y_train0.shape

(180,)

In [63]:
train_s = pd.DataFrame(dict(actual=y_train0))
val_s = pd.DataFrame(dict(actual=y_val0))
#test_s = pd.DataFrame(dict(actual=y_test0))

In [64]:
lm_s = LogisticRegression().fit(X_train0, y_train0)

In [65]:
train_s['predicted'] = lm_s.predict(X_train0)
val_s['predicted'] = lm_s.predict(X_val0)
#test_s['predicted'] = lm_s.predict(X_test)

In [66]:
print('Accuracy: {:.2%}'.format(accuracy_score(train_s.actual, train_s.predicted)))
print('---')
print('Confusion Matrix')
print(pd.crosstab(train_s.predicted, train_s.actual))
print('---')
print(classification_report(train_s.actual, train_s.predicted))

Accuracy: 92.22%
---
Confusion Matrix
actual      C_based  Java  JavaScript  Python  Ruby
predicted                                          
C_based           5     0           0       0     0
Java              7    26           4       1     1
JavaScript        0     0          18       0     0
Python            1     0           0      20     0
Ruby              0     0           0       0    97
---
              precision    recall  f1-score   support

     C_based       1.00      0.38      0.56        13
        Java       0.67      1.00      0.80        26
  JavaScript       1.00      0.82      0.90        22
      Python       0.95      0.95      0.95        21
        Ruby       1.00      0.99      0.99        98

    accuracy                           0.92       180
   macro avg       0.92      0.83      0.84       180
weighted avg       0.95      0.92      0.92       180



In [67]:
print('Accuracy: {:.2%}'.format(accuracy_score(val_s.actual, val_s.predicted)))
print('---')
print('Confusion Matrix')
print(pd.crosstab(val_s.predicted, val_s.actual))
print('---')
print(classification_report(val_s.actual, val_s.predicted))

Accuracy: 74.17%
---
Confusion Matrix
actual      C_based  Java  JavaScript  Python  Ruby
predicted                                          
Java              8    15           9       7     4
JavaScript        0     1           2       0     0
Python            1     1           0       3     0
Ruby              0     0           0       0    69
---
              precision    recall  f1-score   support

     C_based       0.00      0.00      0.00         9
        Java       0.35      0.88      0.50        17
  JavaScript       0.67      0.18      0.29        11
      Python       0.60      0.30      0.40        10
        Ruby       1.00      0.95      0.97        73

    accuracy                           0.74       120
   macro avg       0.52      0.46      0.43       120
weighted avg       0.77      0.74      0.72       120



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


#### Lemm

In [68]:
train_lemm = train[['readme_lemm_no_swords', 'language']].copy()
val_lemm = val[['readme_lemm_no_swords', 'language']].copy()
test_lemm = test[['readme_lemm_no_swords', 'language']].copy()

In [69]:
X_train1, y_train1, X_val1, y_val1, X_test1, y_test1 = train_val_test(train_lemm, val_lemm, test_lemm, target_col)

In [70]:
tfidf_l = TfidfVectorizer()
X_train1 = tfidf_l.fit_transform(X_train1.readme_lemm_no_swords)
X_val1 = tfidf_l.transform(X_val1.readme_lemm_no_swords)
X_test1 = tfidf_l.transform(X_test1.readme_lemm_no_swords)

In [71]:
X_train1.shape

(180, 3536)

In [72]:
y_train1.shape

(180,)

In [73]:
train_l = pd.DataFrame(dict(actual=y_train1))
val_l = pd.DataFrame(dict(actual=y_val1))
#test_l = pd.DataFrame(dict(actual=y_test1))

In [74]:
lm_l = LogisticRegression().fit(X_train1, y_train1)

In [75]:
train_l['predicted'] = lm_l.predict(X_train1)
val_l['predicted'] = lm_l.predict(X_val1)
#test_l['predicted'] = lm_l.predict(X_test1)

In [76]:
print('Accuracy: {:.2%}'.format(accuracy_score(train_l.actual, train_l.predicted)))
print('---')
print('Confusion Matrix')
print(pd.crosstab(train_l.predicted, train_l.actual))
print('---')
print(classification_report(train_l.actual, train_l.predicted))

Accuracy: 92.22%
---
Confusion Matrix
actual      C_based  Java  JavaScript  Python  Ruby
predicted                                          
C_based           5     0           0       0     0
Java              7    26           4       1     1
JavaScript        0     0          18       0     0
Python            1     0           0      20     0
Ruby              0     0           0       0    97
---
              precision    recall  f1-score   support

     C_based       1.00      0.38      0.56        13
        Java       0.67      1.00      0.80        26
  JavaScript       1.00      0.82      0.90        22
      Python       0.95      0.95      0.95        21
        Ruby       1.00      0.99      0.99        98

    accuracy                           0.92       180
   macro avg       0.92      0.83      0.84       180
weighted avg       0.95      0.92      0.92       180



In [77]:
print('Accuracy: {:.2%}'.format(accuracy_score(val_l.actual, val_l.predicted)))
print('---')
print('Confusion Matrix')
print(pd.crosstab(val_l.predicted, val_l.actual))
print('---')
print(classification_report(val_l.actual, val_l.predicted))

Accuracy: 74.17%
---
Confusion Matrix
actual      C_based  Java  JavaScript  Python  Ruby
predicted                                          
Java              7    15           9       7     4
JavaScript        1     2           2       0     0
Python            1     0           0       3     0
Ruby              0     0           0       0    69
---
              precision    recall  f1-score   support

     C_based       0.00      0.00      0.00         9
        Java       0.36      0.88      0.51        17
  JavaScript       0.40      0.18      0.25        11
      Python       0.75      0.30      0.43        10
        Ruby       1.00      0.95      0.97        73

    accuracy                           0.74       120
   macro avg       0.50      0.46      0.43       120
weighted avg       0.76      0.74      0.72       120



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


#### Decission Tree Modeling

In [78]:
X_train, y_train, X_val, y_val, X_test, y_test = train_val_test(train_lemm, val_lemm, test_lemm, target_col)

In [79]:
train_lemm.head()

Unnamed: 0,readme_lemm_no_swords,language
326,simplified procedural learning goal utilize lo...,Ruby
349,game intent recording probability used exercis...,C_based
322,simplified procedural learning goal utilize lo...,Ruby
218,simplified procedural objective 1 utilize logi...,Ruby
346,popular card game played casino around world g...,JavaScript


In [80]:
tfidf = TfidfVectorizer()

In [81]:
X_train = tfidf.fit_transform(X_train['readme_lemm_no_swords'])

In [82]:
X_val = tfidf.transform(X_val['readme_lemm_no_swords'])

In [83]:
X_val = pd.DataFrame(X_val.todense())

In [84]:
X_train = pd.DataFrame(X_train.todense())

In [85]:
clf = DecisionTreeClassifier(max_depth=6, random_state=77)

In [86]:
clf.fit(X_train, y_train)

DecisionTreeClassifier(max_depth=6, random_state=77)

In [87]:
y_pred = clf.predict(X_train)

In [88]:
y_pred_proba = clf.predict_proba(X_train)

In [89]:
print('Accuracy of Decision Tree classifier on training set: {:.2f}'
      .format(clf.score(X_train, y_train)))

Accuracy of Decision Tree classifier on training set: 0.86


In [90]:
pd.crosstab(y_train, y_pred)

col_0,C_based,Java,JavaScript,Python,Ruby
language,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
C_based,4,9,0,0,0
Java,0,26,0,0,0
JavaScript,0,11,11,0,0
Python,0,4,0,17,0
Ruby,0,2,0,0,96


In [91]:
val_pred = clf.predict(X_val)

In [92]:
print('Accuracy of Decision Tree classifier on training set: {:.2f}'
      .format(clf.score(X_val, y_val)))

Accuracy of Decision Tree classifier on training set: 0.78


#### Random Forest Modeling

In [93]:
rf = RandomForestClassifier(bootstrap=True, 
                            class_weight=None, 
                            criterion='gini',
                            min_samples_leaf=5,
                            n_estimators=250,
                            max_depth=6, 
                            random_state=77)

In [94]:
rf.fit(X_train, y_train)

RandomForestClassifier(max_depth=6, min_samples_leaf=5, n_estimators=250,
                       random_state=77)

In [95]:
rf_pred = rf.predict(X_train)

In [96]:
print('Accuracy of random forest classifier on training set: {:.2f}'
     .format(rf.score(X_train, y_train)))

Accuracy of random forest classifier on training set: 0.79


In [97]:
rf2_pred = rf.predict(X_val)

In [98]:
print('Accuracy of random forest classifier on training set: {:.2f}'
     .format(rf.score(X_val, y_val)))

Accuracy of random forest classifier on training set: 0.73


In [99]:
len(train_lemm[train_lemm['language'] == 'Ruby'])/len(train_lemm)

0.5444444444444444