In [15]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import nltk
import unicodedata
import re
import env
import prepare
from matplotlib import style
style.use("ggplot")
from wordcloud import WordCloud
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
from sklearn.feature_extraction.text import TfidfVectorizer
import warnings
warnings.filterwarnings("ignore")

In [16]:
#make df from data.json
df = pd.read_json('data.json')
df.head()

Unnamed: 0,repo,language,readme_contents
0,mcastrolab/Brazil-Covid19-e0-change,R,# Reduction in life expectancy in Brazil after...
1,jschoeley/de0anim,R,# Animated annual changes in life-expectancy\n...
2,sychi77/Thoracic_Surgery_Patient_Survival,Jupyter Notebook,# Thoracic Surgery for Lung Cancer Data Set\n ...
3,ashtad63/HackerRank-Data-Scientist-Hiring-Test,Jupyter Notebook,# HackerRank Data Scientist Hiring Test: Predi...
4,OxfordDemSci/ex2020,R,"<p align=""center"">\n <img src=""https://github..."


In [17]:
# change readme_contents to original
df.rename(columns={'readme_contents':'original'}, inplace=True)
df.head()

Unnamed: 0,repo,language,original
0,mcastrolab/Brazil-Covid19-e0-change,R,# Reduction in life expectancy in Brazil after...
1,jschoeley/de0anim,R,# Animated annual changes in life-expectancy\n...
2,sychi77/Thoracic_Surgery_Patient_Survival,Jupyter Notebook,# Thoracic Surgery for Lung Cancer Data Set\n ...
3,ashtad63/HackerRank-Data-Scientist-Hiring-Test,Jupyter Notebook,# HackerRank Data Scientist Hiring Test: Predi...
4,OxfordDemSci/ex2020,R,"<p align=""center"">\n <img src=""https://github..."


In [18]:
df['clean'] = df['original'].apply(prepare.basic_clean)
df['lemmas'] = df['clean'].apply(prepare.lemmatize)
df['stemmed'] = df['clean'].apply(prepare.stem)
df['more_clean'] = df['lemmas'].apply(prepare.remove_stopwords) #lemmas with stop words removed

In [19]:
df.head()

Unnamed: 0,repo,language,original,clean,lemmas,stemmed,more_clean
0,mcastrolab/Brazil-Covid19-e0-change,R,# Reduction in life expectancy in Brazil after...,reduction in life expectancy in brazil after...,reduction in life expectancy in brazil after c...,reduct in life expect in brazil after covid 19...,reduction life expectancy brazil covid 19 prov...
1,jschoeley/de0anim,R,# Animated annual changes in life-expectancy\n...,animated annual changes in life expectancy\n...,animated annual change in life expectancy illu...,anim annual chang in life expect illustr of re...,animated annual change life expectancy illustr...
2,sychi77/Thoracic_Surgery_Patient_Survival,Jupyter Notebook,# Thoracic Surgery for Lung Cancer Data Set\n ...,thoracic surgery for lung cancer data set\n ...,thoracic surgery for lung cancer data set from...,thorac surgeri for lung cancer data set from u...,thoracic surgery lung cancer data set uci mach...
3,ashtad63/HackerRank-Data-Scientist-Hiring-Test,Jupyter Notebook,# HackerRank Data Scientist Hiring Test: Predi...,hackerrank data scientist hiring test predi...,hackerrank data scientist hiring test predict ...,hackerrank data scientist hire test predict li...,hackerrank data scientist hiring test predict ...
4,OxfordDemSci/ex2020,R,"<p align=""center"">\n <img src=""https://github...",p align center \n img src https github...,p align center img src http github com oxfordd...,p align center img src http github com oxfordd...,p align center img src http github com oxfordd...


In [20]:
# how many languages are there?
df['language'].value_counts()

Jupyter Notebook    46
R                   14
Python              12
HTML                10
JavaScript           7
Stata                4
Dart                 2
Java                 1
MATLAB               1
Shell                1
TypeScript           1
Scala                1
Name: language, dtype: int64

In [21]:
# keep top n languages
df = prepare.keep_top_n_languages(df, n_languages=3) # set to 2 for testing


In [22]:
# how many languages are there?
df['language'].value_counts()

Jupyter Notebook    46
other               36
R                   14
Python              12
Name: language, dtype: int64

In [23]:
tfidf = TfidfVectorizer()
X = tfidf.fit_transform(df['more_clean'])
y = df['language']
# X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=.2, random_state=42)
# X_train, X_validate, y_train, y_validate = train_test_split(X_train, y_train, stratify=y_train, test_size=.3, random_state=42)

X_train, X_validate, X_test, y_train, y_validate, y_test = prepare.split_data(X, y)

In [24]:
train = pd.DataFrame(dict(actual=y_train))
validate = pd.DataFrame(dict(actual=y_validate))
test = pd.DataFrame(dict(actual=y_test))

lm = LogisticRegression().fit(X_train, y_train)

train['predicted'] = lm.predict(X_train)
validate['predicted'] = lm.predict(X_validate)
test['predicted'] = lm.predict(X_test)

In [25]:
print('Accuracy: {:.2%}'.format(accuracy_score(train.actual, train.predicted)))
print('---')
print('Confusion Matrix')
print(pd.crosstab(train.predicted, train.actual))
print('---')
print(classification_report(train.actual, train.predicted))

Accuracy: 68.33%
---
Confusion Matrix
actual            Jupyter Notebook  Python  R  other
predicted                                           
Jupyter Notebook                26       5  7      5
other                            0       1  1     15
---
                  precision    recall  f1-score   support

Jupyter Notebook       0.60      1.00      0.75        26
          Python       0.00      0.00      0.00         6
               R       0.00      0.00      0.00         8
           other       0.88      0.75      0.81        20

        accuracy                           0.68        60
       macro avg       0.37      0.44      0.39        60
    weighted avg       0.56      0.68      0.60        60



In [26]:
print('Accuracy: {:.2%}'.format(accuracy_score(validate.actual, validate.predicted)))
print('---')
print('Confusion Matrix')
print(pd.crosstab(validate.predicted, validate.actual))
print('---')
print(classification_report(validate.actual, validate.predicted))

Accuracy: 50.00%
---
Confusion Matrix
actual            Jupyter Notebook  Python  R  other
predicted                                           
Jupyter Notebook                11       3  3      7
other                            0       0  0      2
---
                  precision    recall  f1-score   support

Jupyter Notebook       0.46      1.00      0.63        11
          Python       0.00      0.00      0.00         3
               R       0.00      0.00      0.00         3
           other       1.00      0.22      0.36         9

        accuracy                           0.50        26
       macro avg       0.36      0.31      0.25        26
    weighted avg       0.54      0.50      0.39        26



In [27]:
print('Accuracy: {:.2%}'.format(accuracy_score(test.actual, test.predicted)))
print('---')
print('Confusion Matrix')
print(pd.crosstab(test.predicted, test.actual))
print('---')
print(classification_report(test.actual, test.predicted))

Accuracy: 45.45%
---
Confusion Matrix
actual            Jupyter Notebook  Python  R  other
predicted                                           
Jupyter Notebook                 9       3  3      6
other                            0       0  0      1
---
                  precision    recall  f1-score   support

Jupyter Notebook       0.43      1.00      0.60         9
          Python       0.00      0.00      0.00         3
               R       0.00      0.00      0.00         3
           other       1.00      0.14      0.25         7

        accuracy                           0.45        22
       macro avg       0.36      0.29      0.21        22
    weighted avg       0.49      0.45      0.32        22

