In [None]:
import os
import json
from typing import Dict, List
import requests
import pandas as pd 
import numpy as np
from bs4 import BeautifulSoup
import unicodedata
import re
import nltk
from nltk.tokenize.toktok import ToktokTokenizer
from nltk.corpus import stopwords

from pprint import pprint
from requests import get

%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns

from env import github_token
from env import github_username

import acquire_titanic

import prep

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
from sklearn.feature_extraction.text import TfidfVectorizer



Explore a dataframe of the repositories returned by a GitHub search for "Titanic", ranked by most stars. We pulled the top 180 repos.

In [None]:
# Pull the scraped repositories into a dataframe and take a first look.
df = acquire_titanic.scrape_github_data()
# A bare `df.head()` in the middle of a cell is evaluated and thrown away
# (only a cell's last expression is echoed), so show the preview explicitly.
display(df.head())
# info() prints its own summary.
df.info()

In [None]:
# Number of rows in the scraped frame.
df.shape[0]

Some of our repos do not have READMEs.

In [None]:
# Rows whose README text is the 'error: no README' placeholder.
missing_readme = df['readme_contents'] == 'error: no README'
no_readme_df = df[missing_readme]
no_readme_df

In [None]:
# How many repos carry the placeholder instead of a README.
no_readme_df.shape[0]

In [None]:
# make a copy of the original df. This keeps the original df intact
#df_prep = df.copy()

In [None]:
# Keep only the rows whose README is not the 'error: no README' placeholder.
has_readme = df['readme_contents'] != 'error: no README'
df = df[has_readme]

In [None]:
# Rows left after removing the placeholder READMEs.
df.shape[0]

In [None]:
# Run the project's prep module over the filtered frame. Later cells read the
# `clean`, `original`, `stemmed` and `lemmatized` columns from the result
# (presumably added by prep_articles — TODO confirm in prep.py).
df = prep.prep_articles(df)

In [None]:
# Spot-check one cleaned README.
# NOTE(review): 112 is a hard-coded key and rows were filtered out above, so
# this lookup may fail if that row was among the ones dropped — confirm.
df.clean[112]

In [None]:
# Preview the first five rows of the prepared frame.
df.head(n=5)

In [None]:
# READMEs per language, with missing languages counted as their own bucket.
df['language'].value_counts(dropna=False)

In [None]:
# Count and share of READMEs per language, side by side.
language_n = df['language'].value_counts()
language_share = df['language'].value_counts(normalize=True)
labels = pd.concat([language_n, language_share], axis=1)
labels.columns = ['n', 'percent']
labels

In [None]:
# Language names, in the same order as the rows of `labels`.
langs = labels.index.tolist()
langs

In [None]:
# README count for each language, aligned with `langs`.
counts = labels['n'].tolist()
counts

In [None]:
# Build "<count> <language>" strings by pairing the two aligned lists.
langs_counts = [f'{count} {lang}' for count, lang in zip(counts, langs)]
langs_counts

In [None]:
import explore_charts

In [None]:
# Donut chart of READMEs per language; the wedge sizes are the raw counts.
wedge_sizes = labels['n'].tolist()
explore_charts.donut_chart(langs_counts, wedge_sizes)

In [None]:
# Bar chart of README counts per language on an explicit figure/axes pair.
plt.rc('font', size=16)
fig, ax = plt.subplots(figsize=(25, 25))
labels['n'].plot.bar(ax=ax)

In [None]:
# List the columns of the prepared frame before choosing which ones to drop.
df.columns

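Looking at a slimmer dataframe: drop the `repo`, `original`, `stemmed`, and `lemmatized` columns and keep the rest for exploration.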

In [None]:
# Slimmer frame for exploration: everything except the columns listed here.
unused_columns = ['repo', 'original', 'stemmed', 'lemmatized']
easy_df = df.drop(columns=unused_columns)
easy_df

In [None]:
# Count and share of READMEs per language, computed from the slim frame.
breakout_n = easy_df['language'].value_counts()
breakout_share = easy_df['language'].value_counts(normalize=True)
breakouts = pd.concat([breakout_n, breakout_share], axis=1)
breakouts.columns = ['n', 'percent']
breakouts

In [None]:
def _readme_words(language=None):
    """Return every whitespace-separated word from the cleaned READMEs.

    language: exact value of `easy_df.language` to keep; None keeps every row.
    """
    if language is None:
        docs = easy_df.clean
    else:
        docs = easy_df[easy_df.language == language].clean
    return " ".join(docs).split()


# One word list and one frequency Series per language. The variable names are
# the ones the later cells refer to.
all_text = _readme_words()
all_text_counts = pd.Series(all_text).value_counts()

javascript_text = _readme_words("JavaScript")
javascript_text_counts = pd.Series(javascript_text).value_counts()

jupyter_text = _readme_words("Jupyter Notebook")
jupyter_text_counts = pd.Series(jupyter_text).value_counts()

python_text = _readme_words("Python")
python_text_counts = pd.Series(python_text).value_counts()

java_text = _readme_words("Java")
java_text_counts = pd.Series(java_text).value_counts()

cplus_text = _readme_words("C++")
cplus_text_counts = pd.Series(cplus_text).value_counts()

go_text = _readme_words("Go")
go_text_counts = pd.Series(go_text).value_counts()


In [None]:
# One row per word, one column per corpus. The columns are named through
# `keys=` rather than `set_axis(..., inplace=False)`, because the `inplace`
# keyword was removed from set_axis in pandas 2.0.
word_counts = (
    pd.concat(
        [
            all_text_counts,
            javascript_text_counts,
            jupyter_text_counts,
            python_text_counts,
            java_text_counts,
            cplus_text_counts,
            go_text_counts,
        ],
        axis=1,
        sort=True,
        keys=['all', 'javascript', 'jupyter', 'python', 'java', 'cplus', 'go'],
    )
    # A word missing from a corpus comes out of the concat as NaN; count it as 0.
    .fillna(0)
    .astype(int)
)
word_counts

In [None]:
# Ten most frequent words across all READMEs.
by_overall_count = word_counts.sort_values(by='all', ascending=False)
by_overall_count.head(10)

In [None]:
# For each language, the six most common words overall that never appear in
# that language's READMEs. The rows are ranked by the 'all' column: ranking by
# the language column itself is meaningless, since it is 0 for every row kept.
language_columns = ['javascript', 'jupyter', 'python', 'java', 'cplus', 'go']
pd.concat([
    word_counts[word_counts[column] == 0].sort_values(by='all').tail(6)
    for column in language_columns
])

In [None]:
def _top_bigrams(words, n=20):
    """Return the `n` most frequent adjacent word pairs in the list `words`."""
    return pd.Series(list(nltk.ngrams(words, 2))).value_counts().head(n)


top_20_text_bigrams = _top_bigrams(all_text)

# Built from javascript_text; this Series was previously computed from
# python_text, which made it a duplicate of the Python bigrams.
top_20_javascript_bigrams = _top_bigrams(javascript_text)
# Original variable name, kept so existing references keep working.
top_20_javascript = top_20_javascript_bigrams

top_20_jupyter_bigrams = _top_bigrams(jupyter_text)

top_20_python_bigrams = _top_bigrams(python_text)

top_20_java_bigrams = _top_bigrams(java_text)

top_20_cplus_bigrams = _top_bigrams(cplus_text)

top_20_go_bigrams = _top_bigrams(go_text)

# Modeling

In [None]:
# TF-IDF vectorizer with default settings; a later cell fits it on the cleaned READMEs.
tfidf = TfidfVectorizer()
tfidf

In [None]:
# Keep only the rows that have a language label.
easy_df = easy_df[easy_df.language.notna()]

In [None]:
# README count per remaining language, held as a one-column frame.
language_doc_counts = easy_df['language'].value_counts(dropna=False)
value_counts_df = pd.DataFrame(language_doc_counts)
value_counts_df

In [None]:
# Languages with fewer than two READMEs. The single count column is selected by
# position because its name depends on the pandas version ('language' before
# 2.0, 'count' from 2.0 on), so attribute access by name is not portable.
doc_counts = value_counts_df.iloc[:, 0]
one_doc_langs = list(doc_counts[doc_counts < 2].index)
one_doc_langs

In [None]:
# Remove every language listed in one_doc_langs with a single boolean mask
# instead of re-filtering the frame once per language.
easy_df = easy_df[~easy_df.language.isin(one_doc_langs)]

In [None]:
# Remaining languages and their README counts.
easy_df['language'].value_counts()

In [None]:
# Turn the cleaned READMEs into the TF-IDF feature matrix.
# NOTE(review): the vectorizer is fit on every document before the train/test
# split in a later cell, so test-set vocabulary and IDF weights leak into
# training. Fitting on the training documents only needs coordinated changes in
# the cells that split X.
X = tfidf.fit_transform(easy_df.clean)
X

In [None]:
# Target: the language label of each README.
y = easy_df['language']
y.head()

In [None]:
# Stratified 70/30 split. The seed is fixed so the scores reported in the
# following cells are reproducible on a fresh kernel.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, test_size=.3, random_state=123
)

In [None]:
# Frame holding the true training labels.
# NOTE(review): 'acutal' is a misspelling of 'actual', but later cells read
# train.acutal, so the column has to be renamed everywhere at once.
train = pd.DataFrame({'acutal': y_train})
train.head()

In [None]:
# Frame holding the true test labels.
test = pd.DataFrame({'actual': y_test})
test.head()

In [None]:
# Logistic regression with default settings, fit on the training split.
lm = LogisticRegression()
lm.fit(X_train, y_train)
lm

In [None]:
# Attach the model's predictions next to the true labels in each frame.
train = train.assign(predicted=lm.predict(X_train))
test = test.assign(predicted=lm.predict(X_test))

In [None]:
# Show actual vs. predicted labels for the training rows. This echoes the whole frame.
train

In [None]:
# Training accuracy of the logistic regression.
accuracy_score(y_true=train['acutal'], y_pred=train['predicted'])

In [None]:
# NOTE(review): hand-computed ratio with no stated source — presumably a
# baseline accuracy (README count of the most common language / total READMEs).
# Confirm, and derive it from the data instead of hard-coding the numbers.
80/163

In [None]:
# Test accuracy of the logistic regression.
accuracy_score(y_true=test['actual'], y_pred=test['predicted'])

In [None]:
# Per-language precision / recall / f1 on the training split.
train_report = classification_report(train['acutal'], train['predicted'])
print(train_report)

In [None]:
# Confusion table on the training split: predicted labels as rows, true labels as columns.
pd.crosstab(index=train['predicted'], columns=train['acutal'])

In [None]:
# Drop rows whose cleaned text is exactly 'error readme'.
# NOTE(review): X and y were built from easy_df before this filter and are not
# rebuilt in this cell, so the split in the later cell still uses the
# unfiltered rows — confirm whether X and y should be recomputed here.
easy_df = easy_df[easy_df.clean != 'error readme']
easy_df.head()

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import export_graphviz

In [None]:
# Stratified 70/30 split with a fixed seed, used by the tree-based models.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=.30,
    stratify=y,
    random_state=123,
)

In [None]:
# Depth-2 decision tree that splits on entropy; seeded for repeatable results.
tree_params = dict(criterion='entropy', max_depth=2, random_state=123)
clf = DecisionTreeClassifier(**tree_params)


In [None]:
# Fit the decision tree on the training split.
clf.fit(X_train, y_train)


In [None]:
# Decision-tree accuracy on the training split.
accuracy_score(y_train, clf.predict(X_train))

In [None]:
# Decision-tree accuracy on the test split.
accuracy_score(y_test, clf.predict(X_test))

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

In [None]:
# Random forest: 100 gini trees, depth capped at 5, at least 3 samples per
# leaf, bootstrapped rows, no class weighting; seeded for repeatable results.
forest_params = dict(
    n_estimators=100,
    criterion='gini',
    max_depth=5,
    min_samples_leaf=3,
    bootstrap=True,
    class_weight=None,
    random_state=123,
)
rf = RandomForestClassifier(**forest_params)

In [None]:
# Fit the random forest on the training split.
rf.fit(X_train, y_train)

In [None]:
# Random-forest predictions for the training rows.
# NOTE(review): y_pred is not read by any later cell visible here.
y_pred = rf.predict(X_train)

In [None]:
# Random-forest accuracy on the training split.
accuracy_score(y_train, rf.predict(X_train))

In [None]:
# Random-forest accuracy on the test split.
accuracy_score(y_test, rf.predict(X_test))

In [None]:
from sklearn.preprocessing import PolynomialFeatures

In [None]:
# Degree-2 polynomial feature expander.
poly = PolynomialFeatures(degree=2)
poly

In [None]:
# Fit the polynomial expander on the training features.
poly.fit(X_train, y_train)

In [None]:
# Keep the expanded training matrix in a variable so it can be reused;
# previously the result was only echoed and then discarded.
X_train_poly = poly.fit_transform(X_train)
X_train_poly

In [None]:
# Refit a default logistic regression on the training split, replacing the earlier `lm`.
# NOTE(review): this fits on X_train, not on the polynomial expansion produced
# in the previous cell — confirm which feature matrix was intended.
lm = LogisticRegression().fit(X_train, y_train)
