In [None]:
# import libaries
import pandas as pd
import numpy as np

# import visualization tools
import matplotlib.pyplot as plt
import seaborn as sns

# import machine learning libaries
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier, export_graphviz
from sklearn.metrics import classification_report
from sklearn import tree

# import NLP tool
from wordcloud import WordCloud
import re

# import self-created functions
import prepare
import project_acquire
import explore

# ignore warning
import warnings
warnings.filterwarnings("ignore")

# Wrangle

In [None]:
# load the previously acquired repo/readme records from the local JSON file
data=pd.read_json('data.json')

In [None]:
# check if we captured any duplicated repo/readme rows
# (True = row repeats an earlier row, False = first occurrence)
data.duplicated().value_counts()

In [None]:
# drop the duplicated repo/readme rows, keeping the first occurrence of each
data=data.drop_duplicates()
# re-check: only False should remain
data.duplicated().value_counts()

### Takeaway
- The acquired READMEs include non-English text. We will drop the READMEs that are not in English; since we acquired about 1000 rows, we will still meet the requirement of at least 100 rows.

#### Install the package needed to detect the language
- \# pip install langdetect

### Key takeaway
- The most-starred README data on GitHub was collected on October 18, 2022; an earlier dataset was deleted because an error code was found, and the data was re-collected on October 18th.
- The data has 1000 rows
- We found that some READMEs contain non-English text; therefore, we installed the langdetect package to further wrangle the dataset
- We also spotted some 'none' values in the language column; we will wrangle those as well
- Following this acquisition, we are going to prepare the data for our exploration

In [None]:
# clean the acquired data
# per the original note, the result carries the cleaned text in tokenized,
# stemmed, and lemmatized form as added columns of the final dataframe
# (wrangle_data lives in the local prepare module, not shown here — TODO confirm)
df=prepare.wrangle_data(data)

In [None]:
# drop rows with missing values (intended for the 'none' values in the language column)
# NOTE(review): dropna() with no arguments drops a row when ANY column is
# missing, not only `language` — confirm that is the intent
df=df.dropna()
df.info()

In [None]:
# we use the installed langdetect package
# to find the readme contents that are NOT in English

# import the language-detection library for this step
import langdetect as ld

# langdetect is non-deterministic by default; fixing the seed makes the set of
# rows flagged below (and the row counts reported later) reproducible
ld.DetectorFactory.seed = 0

def is_en(txt):
    """Return True when `txt` is detected as a language other than English.

    NOTE: despite its name, this flags NON-English text. The name is kept
    because the filtering cell below calls it by this name.

    Parameters
    ----------
    txt : str
        Raw readme text to classify.

    Returns
    -------
    bool
        True if the detected language is not 'en'; False if it is English
        or if detection fails (e.g. text with no detectable features), so
        undetectable rows are kept rather than dropped.
    """
    try:
        return ld.detect(txt)!='en'
    except Exception:
        # best-effort: treat undetectable text as "not flagged", but do not
        # swallow KeyboardInterrupt/SystemExit the way a bare `except:` would
        return False

# rows whose readme was detected as non-English
nodf = df[df['readme_contents'].apply(is_en)]

In [None]:
# drop the rows flagged as non-English (nodf holds the rows where is_en returned True)
df=df.drop(index=(nodf.index))
df.info()

- Our dataframe should contain numerical data for better exploration; therefore, we are going to create some columns that represent the overall structure of the dataframe. We picked df.lemmatized as the basis.

In [None]:
# use the explore module to create the numerical columns and other necessary info for exploration
# (feature_engineering lives in the local explore module; its exact output columns are not shown here)
df=explore.feature_engineering(df)

In [None]:
# save the prepared dataframe to a local CSV file for easy access
# (to_csv writes the index as the first column by default)
df.to_csv('data.csv')

In [None]:
# summary statistics of the dataframe to check the value ranges
df.describe()

### Key takeaway so far
- Numerical columns were created for better exploration
- The foreign-language READMEs were dropped to support our exploration
- We ended up with 669 rows remaining
- `describe` showed significant jumps in those counts, so we believe we need to handle the outliers in our next step

In [None]:
# create a function that flags which cells are NOT outliers (1.5 * IQR rule)
def remove_outlier(df):
    """Build a boolean keep-mask marking values inside the 1.5*IQR fences.

    Quartiles are computed per numeric column only. Non-numeric columns
    (e.g. text) cannot be outliers under this rule and are marked True
    everywhere, so the mask can be applied to the full frame with
    ``df[remove_outlier(df)]``. Restricting to numeric columns explicitly
    avoids the TypeError that ``DataFrame.quantile`` raises on object
    columns in pandas >= 2.0.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect; may mix numeric and non-numeric columns.

    Returns
    -------
    pandas.DataFrame
        Boolean frame with the same index and columns as `df`:
        True = keep the value, False = value is an outlier.
    """
    numeric = df.select_dtypes(include='number')
    Q1 = numeric.quantile(0.25)
    Q3 = numeric.quantile(0.75)
    IQR = Q3 - Q1
    lower = Q1 - 1.5 * IQR
    upper = Q3 + 1.5 * IQR
    # comparisons against NaN are False, so missing values are never flagged
    within = ~(numeric.lt(lower, axis='columns') | numeric.gt(upper, axis='columns'))

    # start from "keep everything", then fill in the numeric columns
    new_df = pd.DataFrame(True, index=df.index, columns=df.columns)
    for col in within.columns:
        new_df[col] = within[col]
    return new_df

In [None]:
# Blank out every flagged outlier cell (it becomes NaN), then discard any row
# that contains a NaN, and report how much of the data was lost.
new_df = df.where(remove_outlier(df))
new_df = new_df.dropna()

dropped_pct = round((1 - len(new_df) / len(df)) * 100, 2)
print('We drop ', dropped_pct, '% rows due to outlier')

# Overall Wrangle Takeaway
- The data was acquired on October 18th, 2022 with 1000 rows
- The data contained 'none' values in the language column; we dropped all 'none' values
- The data contained foreign-language text in the readme contents column; we dropped all those non-English rows
- We created some numerical columns for further exploration
- We dropped the outliers
- We finalized the dataset with 586 rows and are ready for exploration

# Exploration

In [None]:
# Distribution of each non-object column, drawn on a 4x2 grid.
f = plt.figure(figsize=(25,20))
continuous_data = new_df.select_dtypes(exclude=['object'])
plt.suptitle("Distribution of Variable with Trend Line")
# the grid holds at most 8 panels; bounding the loop by the actual column
# count avoids an IndexError when fewer than 8 non-object columns exist
for i in range(min(8, continuous_data.shape[1])):
    ax = f.add_subplot(4,2, i+1)
    # NOTE(review): sns.distplot is deprecated in recent seaborn releases;
    # kept here so the notebook still runs on the seaborn version it was written for
    sns.distplot(continuous_data.iloc[:,i], bins=3, ax=ax)
plt.show()
'''We are showing non-object histogram with a trendline, that the distribution of the data
is normal for unique and repeat, 
but interestingly, the word count and non-single count is left skew'''

In [None]:
# set up the environment to answer the exploration questions
# the top 5 languages are JavaScript, TypeScript, Python, Go and C++
# we will keep only the rows for those top 5 languages and drop the rest
new_df.language.value_counts().head(10)

In [None]:
# create the exploration dataframe restricted to the top 5 languages
a=['JavaScript', 'TypeScript', 'Python', 'Go', 'C++']
exp_df=new_df[new_df.language.isin(a)]

## Q1: What are the most common words in READMEs?

In [None]:
# get the word-frequency table from the explore module
# (the cells below read an 'all' column and one column per language from it)
freq_df=explore.freq_df(exp_df)

In [None]:
# find the top 10 most common words across all READMEs (sorted by the 'all' count)
freq_df.sort_values(by='all',ascending=False).head(10)

In [None]:
# For the 10 most common words overall, show what share of each word's total
# count comes from each of the five languages (stacked horizontal bars).
plt.rcParams["figure.figsize"] = (15,7)

# per-language share of every word's total count
word_shares = freq_df.assign(
    p_js=freq_df.JavaScript / freq_df['all'],
    p_py=freq_df.Python / freq_df['all'],
    p_tS=freq_df.TypeScript / freq_df['all'],
    p_go=freq_df.Go / freq_df['all'],
    p_c=freq_df['C++'] / freq_df['all'],
)

# keep the 10 words with the highest overall count, ordered by JavaScript share
top10_word_shares = (
    word_shares
    .sort_values(by='all')
    [['p_js','p_tS','p_go','p_c', 'p_py']]
    .tail(10)
    .sort_values('p_js')
)

top10_word_shares.plot.barh(stacked=True)
plt.title('Proportion of the 10 most common words by languages')
plt.show()

## Q1 Key Takeaway
- The top ten most used words are: use, install, using, run, file, build, code, version, support, project
- The word 'build' is used significantly more in C++, while JavaScript uses all of the top 10 words consistently more than the other languages
- The words 'use' and 'run' are used less frequently in C++
- The word 'build' is less likely to be used in Python

## Q2: Does the length of the README vary by programming language? If not, do the bigrams differ per language?

In [None]:
# average word_count per language, computed on a copy of the exploration dataframe
q2=exp_df.copy()
q2.groupby('language')['word_count'].mean()

#### Q2 key takeaway so far
- No, the length of the README does not vary by programming language.
- Next, we will explore the bigrams per language to see whether they differ by language

In [None]:
# bigrams per language, using the already-built function in the explore module
q2_2=explore.bigram_clean(exp_df)
# top 10 bigrams by overall count (the 'all_bigram' column)
q2_2.sort_values(by='all_bigram',ascending=False).head(10)

In [None]:
# For the 10 most common bigrams overall, show what share of each bigram's
# total count comes from each of the five languages (stacked horizontal bars).
plt.rcParams["figure.figsize"] = (15,7)

# per-language share of every bigram's total count
bigram_shares = q2_2.assign(
    p_js=q2_2.JavaScript / q2_2['all_bigram'],
    p_py=q2_2.Python / q2_2['all_bigram'],
    p_tS=q2_2.TypeScript / q2_2['all_bigram'],
    p_go=q2_2.Go / q2_2['all_bigram'],
    p_c=q2_2['C++'] / q2_2['all_bigram'],
)

# keep the 10 bigrams with the highest overall count, ordered by JavaScript share
top10_bigram_shares = (
    bigram_shares
    .sort_values(by='all_bigram')
    [['p_js','p_py','p_tS','p_go', 'p_c']]
    .tail(10)
    .sort_values('p_js')
)

top10_bigram_shares.plot.barh(stacked=True)
plt.title('Proportion of the 10 most common bigram words by languages')
plt.show()

## Q2: Key Takeaway:
- Python and TypeScript each have bigrams that are unique to them
- The programming language does not affect the README length
- The bigrams differ significantly between languages
- Interestingly, the Python bigram ''et, al'' has not been seen before; we suspect the code may have broken somewhere

## Q3: Do different programming languages use a different number of unique words?

In [None]:
# render the word-cloud visualization from the local explore module
# (presumably one cloud per language — TODO confirm against explore.vis_cloud)
explore.vis_cloud(exp_df)

In [None]:
# Model 1: unigram bag-of-words counts fed to a depth-limited decision tree.
cv = CountVectorizer()
X, y = cv.fit_transform(exp_df.lemmatized), exp_df.language

# hold out 20% for test, then 25% of the remainder for validation
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=.2, random_state=13
)
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train, test_size=.25, random_state=13
)

# NOTE(review): binding the classifier to `tree` shadows the `tree` module
# imported from sklearn at the top of the notebook
tree = DecisionTreeClassifier(max_depth=12, random_state=123).fit(X_train, y_train)

# accuracy on the validation split
val_accuracy = tree.score(X_val, y_val)
print(f'Accuracy Score: {val_accuracy * 100:.2f}%')

In [None]:
# Model 2: TF-IDF features fed to a depth-limited decision tree.
# The raw text is split BEFORE vectorizing so that the vocabulary and IDF
# weights are learned from the training rows only; fitting on the whole
# corpus would leak validation/test statistics into the features.
y = exp_df.language

# same seeds and sizes as the other model cells, so the row partition is unchanged
text_train, text_test, y_train, y_test = train_test_split(exp_df.lemmatized, y, test_size=.2, random_state=13)
text_train, text_val, y_train, y_val = train_test_split(text_train, y_train, test_size = .25, random_state=13)

tfidf = TfidfVectorizer()
X_train = tfidf.fit_transform(text_train)
X_val = tfidf.transform(text_val)
X_test = tfidf.transform(text_test)
# full matrix in the train-fitted feature space, kept so `X` stays defined
X = tfidf.transform(exp_df.lemmatized)

tree = DecisionTreeClassifier(max_depth=17, random_state=13)
tree.fit(X_train, y_train)

# accuracy on the validation split
print(f'Accuracy Score: {tree.score(X_val, y_val) * 100:.2f}%')

In [None]:
# Model 3: bigram counts fed to a depth-limited decision tree.
cv = CountVectorizer(ngram_range=(2, 2))
X, y = cv.fit_transform(exp_df.lemmatized), exp_df.language

# hold out 20% for test, then 25% of the remainder for validation
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=.2, random_state=13
)
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train, test_size=.25, random_state=13
)

# NOTE(review): binding the classifier to `tree` shadows the `tree` module
# imported from sklearn at the top of the notebook
tree = DecisionTreeClassifier(max_depth=16, random_state=13).fit(X_train, y_train)

# accuracy on the validation split
val_accuracy = tree.score(X_val, y_val)
print(f'Accuracy Score: {val_accuracy * 100:.2f}%')

In [None]:
# Model 4: trigram counts fed to a depth-limited decision tree.
cv = CountVectorizer(ngram_range=(3, 3))
X, y = cv.fit_transform(exp_df.lemmatized), exp_df.language

# hold out 20% for test, then 25% of the remainder for validation
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=.2, random_state=13
)
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train, test_size=.25, random_state=13
)

# NOTE(review): binding the classifier to `tree` shadows the `tree` module
# imported from sklearn at the top of the notebook
tree = DecisionTreeClassifier(max_depth=15, random_state=13).fit(X_train, y_train)

# accuracy on the validation split
val_accuracy = tree.score(X_val, y_val)
print(f'Accuracy Score: {val_accuracy * 100:.2f}%')