In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

import json
from requests import get
from json.decoder import JSONDecodeError
from bs4 import BeautifulSoup
import time
from tqdm import tqdm

import nltk
from nltk.corpus import stopwords
from nltk.tokenize.toktok import ToktokTokenizer
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem import PorterStemmer, SnowballStemmer
#You can also call the Porter by nltk.porter.PorterStemmer
from nltk.stem.snowball import EnglishStemmer

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score, ConfusionMatrixDisplay
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.ensemble import RandomForestClassifier

from nltk.stem import WordNetLemmatizer
from nltk.book import *
from nltk.text import Text

from pprint import pprint

from acquire import *
from prepare_nlp_josh import *
from modeling import *
import env

#These download may need to be run for the NLTK library:

#nltk.download('omw-1.4')
#nltk.download("punkt")
#nltk.download("stopwords")
#nltk.download('averaged_perceptron_tagger')
#nltk.download('tagsets')
#nltk.download("maxent_ne_chunker")
#nltk.download("words")
#nltk.download("book") #big download

*** Introductory Examples for the NLTK Book ***
Loading text1, ..., text9 and sent1, ..., sent9
Type the name of the text or sentence to view it.
Type: 'texts()' or 'sents()' to list the materials.
text1: Moby Dick by Herman Melville 1851
text2: Sense and Sensibility by Jane Austen 1811
text3: The Book of Genesis
text4: Inaugural Address Corpus
text5: Chat Corpus
text6: Monty Python and the Holy Grail
text7: Wall Street Journal
text8: Personals Corpus
text9: The Man Who Was Thursday by G . K . Chesterton 1908


In [2]:
links = get_links()

In [3]:
#When you get new links, delete this line.
#links.href = links.href.str[1:]

In [4]:
links.head()

Unnamed: 0.1,Unnamed: 0,href
0,0,/learn-co-students/simple-blackjack-cli-prework
1,1,/rocketacademy/basics-blackjack
2,2,/datamllab/rlcard
3,3,/learn-co-students/simple-blackjack-cli-001-pr...
4,4,/cocos-creator/tutorial-blackjack-deprecated


In [5]:
#Run this to get new data
df = get_repos(links.href)

In [6]:
#Run to save the data
#df.to_csv("origional_data.csv", index=False)

In [7]:
df.shape

(440, 3)

In [8]:
df.dropna(inplace=True)

In [9]:
df.shape

(418, 3)

In [10]:
df.head()

Unnamed: 0,repo,language,readme_contents
0,/learn-co-students/simple-blackjack-cli-prework,Ruby,# Simplified Blackjack - Procedural\n\n## Lear...
1,/rocketacademy/basics-blackjack,HTML,# Rocket Academy Coding Basics: Blackjack\n
2,/datamllab/rlcard,Python,# RLCard: A Toolkit for Reinforcement Learning...
3,/learn-co-students/simple-blackjack-cli-001-pr...,Ruby,# Simplified Blackjack - Procedural\n\n## Lear...
4,/cocos-creator/tutorial-blackjack-deprecated,JavaScript,# 21点游戏 - Cocos Creator 制造\n\n「21点游戏」是 Cocos C...


In [11]:
df = clean_languages(df)

In [12]:
df['readme_clean'] = df['readme_contents'].apply(basic_clean)

In [13]:
df['readme_clean'].head()

0     simplified blackjack  procedural\n\n learning...
1             rocket academy coding basics blackjack\n
2     rlcard a toolkit for reinforcement learning i...
3     simplified blackjack  procedural\n\n learning...
4     21  cocos creator \n\n21 cocos creator  demo ...
Name: readme_clean, dtype: object

### Tokenize words

In [14]:
df['readme_clean'] = df['readme_clean'].apply(tokenized, tokenize_tool=2)

In [15]:
df['readme_clean'].head()

0    simplified blackjack procedural learning goals...
1               rocket academy coding basics blackjack
2    rlcard a toolkit for reinforcement learning in...
3    simplified blackjack procedural learning goals...
4    21 cocos creator 21 cocos creator demo cocos c...
Name: readme_clean, dtype: object

### Lemm words

In [16]:
df['readme_lemm'] = df['readme_clean'].apply(lemmatized)

### Stem words

- Understemming: Two related words should be reduced to the same stem but aren't. (False negative)


- Overstemming: Two unrelated words are reduced to the same stem even though they shouldn't be. (False positive)

- Consider Snowball stemmer aka Porter2

In [17]:
df['readme_stem'] = df['readme_clean'].apply(stemmerize_tool, stemmer_type=3)

In [18]:
df['readme_stem'].head()

0    simplifi blackjack procedur learn goal util co...
1                  rocket academi code basic blackjack
2    rlcard a toolkit for reinforc learn in card ga...
3    simplifi blackjack procedur learn goal util co...
4    21 coco creator 21 coco creator demo coco crea...
Name: readme_stem, dtype: object

### Remove stopwords

In [19]:
df['readme_stem_no_swords'] = df['readme_stem'].apply(remove_stopwords)

In [20]:
df['readme_lemm_no_swords'] = df['readme_lemm'].apply(remove_stopwords)

### Split data

In [21]:
train, val, test = train_validate(df)

In [22]:
len(train)

200

In [23]:
train.language.value_counts()

Ruby          103
Java           25
JavaScript     22
Other          19
Python         16
C_based        15
Name: language, dtype: int64

### Explore Stems

In [24]:
big_rams_stem = []
for row in train['readme_stem_no_swords'].apply(ngrams_creator):
    big_rams_stem.extend(row)

In [25]:
bi_stem_series = pd.Series(big_rams_stem)

In [26]:
trig_rams_stem = []
for row in train['readme_stem_no_swords'].apply(ngrams_creator, n_grams=3):
    trig_rams_stem.extend(row)

In [27]:
tri_stem_series = pd.Series(trig_rams_stem)

In [28]:
bi_stem_series.value_counts().head()

(command, line)     1548
(card, total)       1103
(user, input)        923
(method, method)     901
(runner, method)     900
dtype: int64

In [29]:
tri_stem_series.value_counts().head()

(command, line, app)         640
(method, take, argument)     400
(deal, new, card)            303
(h, hit, stay)               301
(command, line, interfac)    300
dtype: int64

### Explore Lemms

In [30]:
big_rams_lemm = []
for row in train['readme_lemm_no_swords'].apply(ngrams_creator):
    big_rams_lemm.extend(row)

In [31]:
bi_lemm_series = pd.Series(big_rams_lemm)

In [32]:
trig_rams_lemm = []
for row in train['readme_lemm_no_swords'].apply(ngrams_creator, n_grams=3):
    trig_rams_lemm.extend(row)

In [33]:
tri_lemm_series = pd.Series(trig_rams_lemm)

In [34]:
bi_lemm_series.value_counts().head()

(command, line)     1548
(card, total)       1103
(user, input)        923
(method, method)     901
(runner, method)     900
dtype: int64

In [35]:
tri_lemm_series.value_counts().head()

(command, line, app)        540
(method, take, argument)    400
(deal, new, card)           303
(h, hit, stay)              301
(runner, method, runner)    300
dtype: int64

###  Logistic Regression Modeling on Stem

#### Term Frequency

In [36]:
#Work on train['readme_stem_no_swords']

In [37]:
list_of_stem_words = []
#words = pd.Series(document.split())
for item in train['readme_stem_no_swords']:
    list_of_stem_words.extend(item.split(' '))

In [38]:
stem_series = pd.Series(list_of_stem_words)

In [39]:
stem_words_df = (pd.DataFrame({'raw_count': stem_series.value_counts()})
 .assign(frequency=lambda df: df.raw_count / df.raw_count.sum())
 .assign(augmented_frequency=lambda df: df.frequency / df.frequency.max()))

In [40]:
stem_words_df

Unnamed: 0,raw_count,frequency,augmented_frequency
method,8674,0.064810,1.000000
card,3134,0.023417,0.361310
test,2250,0.016811,0.259396
use,2098,0.015676,0.241872
line,1860,0.013898,0.214434
...,...,...,...
posso,1,0.000007,0.000115
descrev,1,0.000007,0.000115
lista,1,0.000007,0.000115
ordem,1,0.000007,0.000115


#### Inverse Document Frequency (IDF)

In [41]:
"""
def idf(word, df):
    n_occurences = sum([1 for doc in df if word in doc])
    return len(df) / n_occurences

# Get a list of the unique words
unique_words = pd.Series(list_of_stem_words).unique()

# put the unique words into a data frame
(pd.DataFrame(dict(word=unique_words))
 # calculate the idf for each word
 .assign(idf=lambda df: df.word.apply(idf))
 # sort the data for presentation purposes
 .set_index('word')
 .sort_values(by='idf', ascending=False)
 .head(5))
 """

"\ndef idf(word, df):\n    n_occurences = sum([1 for doc in df if word in doc])\n    return len(df) / n_occurences\n\n# Get a list of the unique words\nunique_words = pd.Series(list_of_stem_words).unique()\n\n# put the unique words into a data frame\n(pd.DataFrame(dict(word=unique_words))\n # calculate the idf for each word\n .assign(idf=lambda df: df.word.apply(idf))\n # sort the data for presentation purposes\n .set_index('word')\n .sort_values(by='idf', ascending=False)\n .head(5))\n "

#### SKlearn Stem

In [42]:
tfidf_s = TfidfVectorizer()
tfidf_stem = tfidf_s.fit_transform(list_of_stem_words)
tfidf_stem

<133837x3984 sparse matrix of type '<class 'numpy.float64'>'
	with 131400 stored elements in Compressed Sparse Row format>

In [43]:
pd.DataFrame(tfidf_stem.todense(), columns=tfidf_s.get_feature_names())



Unnamed: 0,00,000,001,010,015,02,021101301,03,04,073,...,youaposl,youd,youll,yourtoken,youtub,youv,yum,zero,zerodepend,zuza
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
133832,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
133833,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
133834,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
133835,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Modeling on Lemm

#### Term Frequency

In [44]:
#Work on train['readme_lemm_no_swords']

In [45]:
list_of_lemm_words = []
#words = pd.Series(document.split())
for item in train['readme_lemm_no_swords']:
    list_of_lemm_words.extend(item.split(' '))

In [46]:
lemm_series = pd.Series(list_of_lemm_words)

In [47]:
lemm_words_df = (pd.DataFrame({'raw_count': lemm_series.value_counts()})
 .assign(frequency=lambda df: df.raw_count / df.raw_count.sum())
 .assign(augmented_frequency=lambda df: df.frequency / df.frequency.max()))

In [48]:
lemm_words_df

Unnamed: 0,raw_count,frequency,augmented_frequency
method,8674,0.065258,1.000000
card,3134,0.023578,0.361310
test,2115,0.015912,0.243832
line,1860,0.013993,0.214434
put,1830,0.013768,0.210975
...,...,...,...
cuidado,1,0.000008,0.000115
extremo,1,0.000008,0.000115
optar,1,0.000008,0.000115
uso,1,0.000008,0.000115


#### Inverse Document Frequency (IDF)

#### SKlearn Lemm

In [49]:
tfidf_l = TfidfVectorizer()
tfidf_lemm = tfidf_l.fit_transform(list_of_lemm_words)
tfidf_lemm

<132919x4671 sparse matrix of type '<class 'numpy.float64'>'
	with 130261 stored elements in Compressed Sparse Row format>

In [50]:
pd.DataFrame(tfidf_lemm.todense(), columns=tfidf_l.get_feature_names())



Unnamed: 0,00,000,001,010,015,02,021101301,03,04,073,...,youd,youll,youre,yourtoken,youtube,youve,yum,zero,zerodependencies,zuza
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
132914,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
132915,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
132916,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
132917,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Modeling with Logistic Regression Classification

#### Stem

In [51]:
target_col = "language"

In [52]:
train_stem = train[['readme_stem_no_swords', 'language']].copy()
val_stem = val[['readme_stem_no_swords', 'language']].copy()
test_stem = test[['readme_stem_no_swords', 'language']].copy()

In [53]:
X_train0, y_train0, X_val0, y_val0, X_test0, y_test0 = train_val_test(train_stem, val_stem, test_stem, target_col)

In [54]:
X_train0.shape

(200, 1)

In [55]:
X_train0.head()

Unnamed: 0,readme_stem_no_swords
102,simplifi blackjack procedur learn goal util co...
275,evolutionarynet evolutionarynet artifici intel...
136,simplifi blackjack procedur learn goal util co...
63,blackjack react app blackjack card game built ...
394,simplifi blackjack procedur learn goal util co...


In [56]:
y_train0.head()

102       Ruby
275    C_based
136       Ruby
63       Other
394       Ruby
Name: language, dtype: object

In [57]:
X_train0.readme_stem_no_swords

102    simplifi blackjack procedur learn goal util co...
275    evolutionarynet evolutionarynet artifici intel...
136    simplifi blackjack procedur learn goal util co...
63     blackjack react app blackjack card game built ...
394    simplifi blackjack procedur learn goal util co...
                             ...                        
114    simplifi blackjack procedur learn goal util co...
361    blackjack vinteeum build statushttpstraviscior...
231    simplifi blackjack procedur learn goal util co...
12     diamondblackjack hello everyon ive spent past ...
168    simpl js html blackjack game build use jqueri ...
Name: readme_stem_no_swords, Length: 200, dtype: object

In [58]:
tfidf_s = TfidfVectorizer()
X_train0 = tfidf_s.fit_transform(X_train0.readme_stem_no_swords)
X_val0 = tfidf_s.transform(X_val0.readme_stem_no_swords)
X_test0 = tfidf_s.transform(X_test0.readme_stem_no_swords)

In [59]:
X_train0

<200x3984 sparse matrix of type '<class 'numpy.float64'>'
	with 43258 stored elements in Compressed Sparse Row format>

In [60]:
y_train0.shape

(200,)

In [61]:
train_s = pd.DataFrame(dict(actual=y_train0))
val_s = pd.DataFrame(dict(actual=y_val0))
#test_s = pd.DataFrame(dict(actual=y_test0))

In [62]:
lm_s = LogisticRegression().fit(X_train0, y_train0)

In [63]:
train_s['predicted'] = lm_s.predict(X_train0)
val_s['predicted'] = lm_s.predict(X_val0)
#test_s['predicted'] = lm_s.predict(X_test)

In [64]:
print('Accuracy: {:.2%}'.format(accuracy_score(train_s.actual, train_s.predicted)))
print('---')
print('Confusion Matrix')
print(pd.crosstab(train_s.predicted, train_s.actual))
print('---')
print(classification_report(train_s.actual, train_s.predicted))

Accuracy: 94.50%
---
Confusion Matrix
actual      C_based  Java  JavaScript  Other  Python  Ruby
predicted                                                 
C_based          13     0           0      0       0     0
Java              2    25           3      2       1     3
JavaScript        0     0          19      0       0     0
Other             0     0           0     17       0     0
Python            0     0           0      0      15     0
Ruby              0     0           0      0       0   100
---
              precision    recall  f1-score   support

     C_based       1.00      0.87      0.93        15
        Java       0.69      1.00      0.82        25
  JavaScript       1.00      0.86      0.93        22
       Other       1.00      0.89      0.94        19
      Python       1.00      0.94      0.97        16
        Ruby       1.00      0.97      0.99       103

    accuracy                           0.94       200
   macro avg       0.95      0.92      0.93       20

In [65]:
print('Accuracy: {:.2%}'.format(accuracy_score(val_s.actual, val_s.predicted)))
print('---')
print('Confusion Matrix')
print(pd.crosstab(val_s.predicted, val_s.actual))
print('---')
print(classification_report(val_s.actual, val_s.predicted))

Accuracy: 66.42%
---
Confusion Matrix
actual      C_based  Java  JavaScript  Other  Python  Ruby
predicted                                                 
Java              8    15           6      7       8     4
JavaScript        1     1           4      6       0     1
Other             0     0           2      4       0     0
Python            1     0           0      0       3     0
Ruby              0     0           0      0       0    63
---
              precision    recall  f1-score   support

     C_based       0.00      0.00      0.00        10
        Java       0.31      0.94      0.47        16
  JavaScript       0.31      0.33      0.32        12
       Other       0.67      0.24      0.35        17
      Python       0.75      0.27      0.40        11
        Ruby       1.00      0.93      0.96        68

    accuracy                           0.66       134
   macro avg       0.51      0.45      0.42       134
weighted avg       0.72      0.66      0.65       134



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


#### Lemm

In [66]:
train_lemm = train[['readme_lemm_no_swords', 'language']].copy()
val_lemm = val[['readme_lemm_no_swords', 'language']].copy()
test_lemm = test[['readme_lemm_no_swords', 'language']].copy()

In [67]:
X_train1, y_train1, X_val1, y_val1, X_test1, y_test1 = train_val_test(train_lemm, val_lemm, test_lemm, target_col)

In [68]:
tfidf_l = TfidfVectorizer()
X_train1 = tfidf_l.fit_transform(X_train1.readme_lemm_no_swords)
X_val1 = tfidf_l.transform(X_val1.readme_lemm_no_swords)
X_test1 = tfidf_l.transform(X_test1.readme_lemm_no_swords)

In [69]:
X_train1.shape

(200, 4671)

In [70]:
y_train1.shape

(200,)

In [71]:
train_l = pd.DataFrame(dict(actual=y_train1))
val_l = pd.DataFrame(dict(actual=y_val1))
#test_l = pd.DataFrame(dict(actual=y_test1))

In [72]:
lm_l = LogisticRegression().fit(X_train1, y_train1)

In [73]:
train_l['predicted'] = lm_l.predict(X_train1)
val_l['predicted'] = lm_l.predict(X_val1)
#test_l['predicted'] = lm_l.predict(X_test1)

In [74]:
print('Accuracy: {:.2%}'.format(accuracy_score(train_l.actual, train_l.predicted)))
print('---')
print('Confusion Matrix')
print(pd.crosstab(train_l.predicted, train_l.actual))
print('---')
print(classification_report(train_l.actual, train_l.predicted))

Accuracy: 94.50%
---
Confusion Matrix
actual      C_based  Java  JavaScript  Other  Python  Ruby
predicted                                                 
C_based          13     0           0      0       0     0
Java              2    25           3      2       1     3
JavaScript        0     0          19      0       0     0
Other             0     0           0     17       0     0
Python            0     0           0      0      15     0
Ruby              0     0           0      0       0   100
---
              precision    recall  f1-score   support

     C_based       1.00      0.87      0.93        15
        Java       0.69      1.00      0.82        25
  JavaScript       1.00      0.86      0.93        22
       Other       1.00      0.89      0.94        19
      Python       1.00      0.94      0.97        16
        Ruby       1.00      0.97      0.99       103

    accuracy                           0.94       200
   macro avg       0.95      0.92      0.93       20

In [75]:
print('Accuracy: {:.2%}'.format(accuracy_score(val_l.actual, val_l.predicted)))
print('---')
print('Confusion Matrix')
print(pd.crosstab(val_l.predicted, val_l.actual))
print('---')
print(classification_report(val_l.actual, val_l.predicted))

Accuracy: 67.91%
---
Confusion Matrix
actual      C_based  Java  JavaScript  Other  Python  Ruby
predicted                                                 
Java              8    16           6      7       8     5
JavaScript        1     0           4      5       0     0
Other             0     0           2      5       0     0
Python            1     0           0      0       3     0
Ruby              0     0           0      0       0    63
---
              precision    recall  f1-score   support

     C_based       0.00      0.00      0.00        10
        Java       0.32      1.00      0.48        16
  JavaScript       0.40      0.33      0.36        12
       Other       0.71      0.29      0.42        17
      Python       0.75      0.27      0.40        11
        Ruby       1.00      0.93      0.96        68

    accuracy                           0.68       134
   macro avg       0.53      0.47      0.44       134
weighted avg       0.73      0.68      0.66       134



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


#### Decission Tree Modeling

In [76]:
X_train, y_train, X_val, y_val, X_test, y_test = train_val_test(train_lemm, val_lemm, test_lemm, target_col)

In [77]:
train_lemm.head()

Unnamed: 0,readme_lemm_no_swords,language
102,simplified blackjack procedural learning goal ...,Ruby
275,evolutionarynet evolutionarynet artificial int...,C_based
136,simplified blackjack procedural learning goal ...,Ruby
63,blackjack react app blackjack card game built ...,Other
394,simplified blackjack procedural learning goal ...,Ruby


In [78]:
tfidf = TfidfVectorizer()

In [79]:
X_train = tfidf.fit_transform(X_train['readme_lemm_no_swords'])

In [80]:
X_val = tfidf.transform(X_val['readme_lemm_no_swords'])

In [81]:
X_val = pd.DataFrame(X_val.todense())

In [82]:
X_train = pd.DataFrame(X_train.todense())

In [83]:
clf = DecisionTreeClassifier(max_depth=6, random_state=77)

In [84]:
clf.fit(X_train, y_train)

DecisionTreeClassifier(max_depth=6, random_state=77)

In [85]:
y_pred = clf.predict(X_train)

In [86]:
y_pred_proba = clf.predict_proba(X_train)

In [87]:
print('Accuracy of Decision Tree classifier on training set: {:.2f}'
      .format(clf.score(X_train, y_train)))

Accuracy of Decision Tree classifier on training set: 0.79


In [88]:
pd.crosstab(y_train, y_pred)

col_0,C_based,Java,JavaScript,Other,Python,Ruby
language,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
C_based,7,8,0,0,0,0
Java,0,25,0,0,0,0
JavaScript,1,14,7,0,0,0
Other,1,12,0,6,0,0
Python,0,5,0,0,11,0
Ruby,1,1,0,0,0,101


In [89]:
val_pred = clf.predict(X_val)

In [90]:
print('Accuracy of Decision Tree classifier on training set: {:.2f}'
      .format(clf.score(X_val, y_val)))

Accuracy of Decision Tree classifier on training set: 0.63


#### Random Forest Modeling

In [91]:
rf = RandomForestClassifier(bootstrap=True, 
                            class_weight=None, 
                            criterion='gini',
                            min_samples_leaf=5,
                            n_estimators=250,
                            max_depth=6, 
                            random_state=77)

In [92]:
rf.fit(X_train, y_train)

RandomForestClassifier(max_depth=6, min_samples_leaf=5, n_estimators=250,
                       random_state=77)

In [93]:
rf_pred = rf.predict(X_train)

In [94]:
print('Accuracy of random forest classifier on training set: {:.2f}'
     .format(rf.score(X_train, y_train)))

Accuracy of random forest classifier on training set: 0.78


In [95]:
rf2_pred = rf.predict(X_val)

In [96]:
print('Accuracy of random forest classifier on training set: {:.2f}'
     .format(rf.score(X_val, y_val)))

Accuracy of random forest classifier on training set: 0.64


In [97]:
len(train_lemm[train_lemm['language'] == 'Ruby'])/len(train_lemm)

0.515