In [1]:
import pandas as pd
import os
import numpy as np


## Reading datasets

In [None]:
# Load the sample file from the data directory.
sample_data = pd.read_csv("../data/sample.csv")

In [None]:
# Peek at the first rows of the sample file.
sample_data.head()

In [None]:
# NOTE(review): the frame is named `val_data` but is read from train.csv —
# confirm which split is intended.
val_data = pd.read_csv("../data/train.csv")

In [None]:
# Peek at the first rows of the main frame used in the rest of the notebook.
val_data.head()

In [None]:
# Inspect the `patch` value at index label 0.
val_data['patch'][0]

In [None]:
# Summary statistics of the frame.
val_data.describe()

In [None]:
import re
import nltk
from collections import Counter
import scipy.sparse as sp
from numpy.linalg import norm

In [None]:
# Fetch only the NLTK resources this notebook uses (stop-word list and the
# tokenizer models behind nltk.word_tokenize). A bare nltk.download() opens the
# interactive downloader, which stalls a "Restart Kernel -> Run All".
# 'punkt_tab' is the tokenizer data name used by recent NLTK releases; 'punkt'
# covers older ones.
for resource in ("stopwords", "punkt", "punkt_tab"):
    nltk.download(resource, quiet=True)

In [None]:
class TFIDF(object):
    """Hand-rolled TF-IDF over a small corpus of text snippets.

    Intended call order:
    ``preprocessing_text()`` -> ``tf()`` -> ``df(tf)`` -> ``idf(df)`` -> ``tfidf(tf, idf)``.
    """

    def __init__(self, corpus):
        """Store the raw corpus; ``norm_corpus`` is filled by ``preprocessing_text``."""
        self.corpus = corpus
        self.norm_corpus = None

    def remove_stopwords(self, text, is_lower_case=False):
        """Replace newlines/tabs with spaces, tokenize, and drop English stop words.

        Args:
            text: A string, or an iterable of strings that is concatenated first.
            is_lower_case: True when ``text`` is already lower-cased, so tokens are
                compared to the stop-word list as-is; otherwise each token is
                lower-cased for the comparison only (its original case is kept).

        Returns:
            The remaining tokens joined by single spaces.
        """
        # A set makes the per-token membership test O(1) instead of scanning a list.
        stopword_set = set(nltk.corpus.stopwords.words('english'))

        text = re.sub(r'\n|\t', " ", ''.join(text))
        tokens = [tok.strip() for tok in nltk.word_tokenize(text)]
        if is_lower_case:
            # This branch used to assign a misspelled name, which made the join
            # below fail with NameError whenever is_lower_case=True.
            cleaned_tokens = [tok for tok in tokens if tok not in stopword_set]
        else:
            cleaned_tokens = [tok for tok in tokens if tok.lower() not in stopword_set]
        return ' '.join(cleaned_tokens)

    def __normalize_corpus(self, d):
        """Strip non-alphanumeric characters from ``d``, lower-case it and drop stop words.

        Returns the surviving tokens joined by single spaces.
        """
        stop_words = set(nltk.corpus.stopwords.words('english'))
        # flags must be passed by keyword: the fourth positional argument of
        # re.sub is `count`, so the flags were previously read as a replacement limit.
        d = re.sub(r'[^a-zA-Z0-9\s]', '', d, flags=re.I | re.A)
        d = d.lower().strip()
        tokens = nltk.word_tokenize(d)
        return ' '.join(t for t in tokens if t not in stop_words)

    def preprocessing_text(self):
        """Run ``remove_stopwords`` over every document and store the result in ``norm_corpus``."""
        # A plain loop (instead of np.vectorize) also works for an empty corpus and
        # does not call the cleaner twice on the first document.
        cleaned = [self.remove_stopwords(doc) for doc in self.corpus]
        self.norm_corpus = np.array(cleaned, dtype=str)

    def tf(self):
        """Return raw term counts as a DataFrame (one row per document, one column per term).

        Columns are sorted alphabetically so the layout is reproducible across runs.
        """
        tokenized = [doc.split() for doc in self.norm_corpus]
        vocabulary = sorted({word for words in tokenized for word in words})
        rows = []
        for words in tokenized:
            counts = Counter(words)
            # Counter returns 0 for terms that do not occur in this document.
            rows.append([counts[term] for term in vocabulary])
        return pd.DataFrame(rows, columns=vocabulary)

    def df(self, tf):
        """Return the smoothed document frequency (1 + number of documents containing each term)."""
        # In CSC form, consecutive indptr differences are the non-zero counts per column.
        doc_counts = np.diff(sp.csc_matrix(tf, copy=True).indptr)
        return 1 + doc_counts

    def idf(self, df):
        """Return the smoothed inverse document frequency.

        Args:
            df: Smoothed document frequencies as returned by ``df``.

        Returns:
            A tuple ``(idf, idf_d)``: the per-term idf vector and the same values
            laid out on the diagonal of a dense square matrix.
        """
        N = 1 + len(self.norm_corpus)
        idf = (1.0 + np.log(float(N) / df))
        idf_d = sp.spdiags(idf, diags=0, m=len(df), n=len(df)).todense()
        return idf, idf_d

    def tfidf(self, tf, idf):
        """Return the L2-normalised TF-IDF matrix (documents x terms).

        Rows whose norm is zero (documents with no remaining terms) are returned
        as all zeros instead of NaN.
        """
        tf = np.array(tf, dtype='float64')
        tfidf = tf * idf
        norms = norm(tfidf, axis=1)
        # Avoid 0/0 for empty documents; dividing a zero row by 1 leaves it at zero.
        norms[norms == 0] = 1.0
        return tfidf / norms[:, None]

In [None]:
# List every column name of the main frame.
val_data.columns.values.tolist()

In [None]:
#pd.Series({c: val_data[c].unique() for c in val_data})
# Distinct values of the `lang` column.
val_data["lang"].unique()

In [None]:
# Summary statistics of the `del_lines` column.
val_data["del_lines"].describe()

In [None]:
# `Summary` values as a plain list (missing values still included); show the item at position 9.
test= val_data["Summary"].tolist()
test[9]

In [None]:
# Same column with missing values dropped; this rebinds `test`.
test= val_data["Summary"].dropna().tolist()
#test = test[8:10]
# NOTE(review): this displays the entire list — consider slicing if the output is large.
test

In [None]:
# Non-null `func_before` entries; this list is the corpus used by the cells below.
data= val_data["func_before"].dropna().tolist()

In [None]:
# Build the custom TF-IDF helper over the first 100 corpus entries.
a = TFIDF(data[:100])

In [None]:
# Clean every entry and store the result on a.norm_corpus.
a.preprocessing_text()

In [None]:
# Term counts: one row per entry, one column per term.
tf = a.tf()
tf

In [None]:
# Smoothed document frequency per term.
# NOTE: `df` is the return value of TFIDF.df here; the same name is rebound to a
# DataFrame a few cells below.
df = a.df(tf)
df

In [None]:
# idf vector plus the same values on the diagonal of a dense matrix.
idf, idf_d = a.idf(df)


In [None]:
# Row-normalised TF-IDF scores.
tfidf = a.tfidf(tf, idf)


In [None]:
# Scores rounded to two decimals, labelled with the term columns of `tf`, then
# columns ordered alphabetically. This rebinds `df`.
df = pd.DataFrame(np.round(tfidf,2), columns= list(tf.columns))
sorted_column_df = df.sort_index(axis=1)
sorted_column_df

In [None]:
# Reshape to long form (one row per entry/term pair).
# NOTE(review): the name is rebound to its own transformation, so re-running this
# cell stacks the already-stacked frame.
sorted_column_df = sorted_column_df.stack().reset_index()

In [None]:
sorted_column_df

In [None]:
# Name the long-form columns, then show the ten highest-scoring terms per snippet.
tfidf_df = sorted_column_df.rename(columns={0:'tfidf', 'level_0': 'snippet','level_1': 'term', 'level_2': 'count'})
tfidf_df.sort_values(by=['snippet','tfidf'], ascending=[True,False]).groupby(['snippet']).head(10)


## Testing TF-IDF for natural language

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd
from pathlib import Path  

In [None]:
# Build the stop-word list here: `stopword_list` is otherwise only assigned in a
# much later cell, so on a fresh kernel this line raised NameError.
stopword_list = nltk.corpus.stopwords.words('english')
tfidf_vectorizer = TfidfVectorizer(input='content', stop_words=stopword_list)


In [None]:
# Fit the vectorizer on the first 100 corpus entries and wrap the dense scores in
# a frame with one column per vocabulary term.
tfidf_vector = tfidf_vectorizer.fit_transform(data[:100])
tfidf_df = pd.DataFrame(tfidf_vector.toarray(), columns=tfidf_vectorizer.get_feature_names_out())

tfidf_df

In [None]:
# The matrix returned by fit_transform, before conversion to a dense array.
tfidf_vector

In [None]:
# Append a summary row holding, per term, how many rows have a score above zero.
# NOTE(review): this mutates tfidf_df in place; re-running the cell also counts
# the previously added summary row.
tfidf_df.loc['00_Document Frequency'] = (tfidf_df > 0).sum()

In [None]:
tfidf_df

In [None]:
# Vocabulary terms (column labels) as a plain list.
tfidf_df.columns.values.tolist()

In [None]:
# Spot-check a handful of terms. Only terms present in the fitted vocabulary are
# selected, so a term that does not occur in the corpus no longer raises KeyError.
terms_of_interest = ['commit', 'android', 'access', 'amd', 'tcp']
tfidf_slice = tfidf_df[[term for term in terms_of_interest if term in tfidf_df.columns]]
tfidf_slice

In [None]:
# Preview of the long layout (one row per row-label/term pair); the result is
# only displayed, not assigned, so tfidf_df itself stays wide.
tfidf_df.stack().reset_index()


In [None]:
# `tfidf_df` is still the wide document x term matrix at this point (the stacked
# preview above was never assigned), so renaming its columns had no effect and
# sorting by 'document'/'tfidf' raised KeyError. Reshape it here instead.
# The '00_Document Frequency' summary row is dropped so it is not ranked as a document.
# A separate name keeps `tfidf_df` wide, which makes this cell safe to re-run.
tfidf_long = (
    tfidf_df
    .drop('00_Document Frequency', errors='ignore')
    .stack()
    .reset_index()
    .rename(columns={0: 'tfidf', 'level_0': 'document', 'level_1': 'term'})
)

# Ten highest-scoring terms per document; the heatmap cell below reads `top_tfidf`.
top_tfidf = (
    tfidf_long
    .sort_values(by=['document', 'tfidf'], ascending=[True, False])
    .groupby(['document'])
    .head(10)
)
top_tfidf



In [None]:
import altair as alt


In [None]:
# Terms in this list will get a red dot in the visualization
# NOTE(review): 'war'/'peace' look like leftovers from a natural-language example —
# confirm they are meaningful for this corpus.
term_list = ['war', 'peace']

# NOTE(review): this cell reads `top_tfidf` with `document`, `term` and `tfidf`
# columns; make sure it has been built before running on a fresh kernel.

# adding a little randomness to break ties in term ranking
# (seeded generator so the ranking, and therefore the chart, is reproducible)
jitter_rng = np.random.default_rng(0)
top_tfidf_plusRand = top_tfidf.copy()
top_tfidf_plusRand['tfidf'] = top_tfidf_plusRand['tfidf'] + jitter_rng.random(top_tfidf.shape[0])*0.0001

# base for all visualizations, with rank calculation
base = alt.Chart(top_tfidf_plusRand).encode(
    x = 'rank:O',
    y = 'document:N'
).transform_window(
    rank = "rank()",
    sort = [alt.SortField("tfidf", order="descending")],
    groupby = ["document"],
)

# heatmap specification
heatmap = base.mark_rect().encode(
    color = 'tfidf:Q'
)

# red circle over terms in above list
circle = base.mark_circle(size=100).encode(
    color = alt.condition(
        alt.FieldOneOfPredicate(field='term', oneOf=term_list),
        alt.value('red'),
        alt.value('#FFFFFF00')
    )
)

# text labels, white for darker heatmap colors
text = base.mark_text(baseline='middle').encode(
    text = 'term:N',
    color = alt.condition(alt.datum.tfidf >= 0.23, alt.value('white'), alt.value('black'))
)

# display the three superimposed visualizations
(heatmap + circle + text).properties(width = 600)

In [None]:
# Rebuild the corpus: non-null `func_before` entries as a list.
data= val_data["func_before"].dropna().tolist()

In [None]:
# NOTE(review): displays the entire corpus list — consider slicing if the output is large.
data

In [None]:
# Refit the vectorizer on the first 100 entries and rebuild the wide score frame.
tfidf_vector = tfidf_vectorizer.fit_transform(data[:100])
tfidf_df = pd.DataFrame(tfidf_vector.toarray(), columns=tfidf_vectorizer.get_feature_names_out())

tfidf_df

In [None]:
# Append a summary row holding, per term, how many rows have a score above zero.
tfidf_df.loc['total'] = (tfidf_df > 0).sum()


In [None]:
tfidf_df

In [None]:
# Remove the summary row again before reshaping (no error if it is already gone).
tfidf_df = tfidf_df.drop('total', errors='ignore')


In [None]:
# Preview of the long layout; not assigned.
tfidf_df.stack().reset_index()


In [None]:
# Reshape to long form for real.
# NOTE(review): the name is rebound to its own transformation, so re-running this
# cell stacks the already-stacked frame.
tfidf_df = tfidf_df.stack().reset_index()


In [None]:
# Name the long-form columns: row label -> snippet, column label -> term, value -> tfidf.
tfidf_df = tfidf_df.rename(columns={0:'tfidf', 'level_0': 'snippet','level_1': 'term', 'level_2': 'term'})

In [None]:
# Ten highest-scoring terms per snippet (display only).
tfidf_df.sort_values(by=['snippet','tfidf'], ascending=[True,False]).groupby(['snippet']).head(10)


In [None]:
# Twenty highest-scoring terms per snippet; input of the heatmap cells below.
top_tfidf = tfidf_df.sort_values(by=['snippet','tfidf'], ascending=[True,False]).groupby(['snippet']).head(20)

In [None]:
# Rows whose term contains the substring 'for'.
top_tfidf[top_tfidf['term'].str.contains('for')]

In [None]:
top_tfidf

In [None]:
# Terms in this list will get a red dot in the visualization
term_list = ['for', 'if']

# Add a tiny amount of noise to the scores so ties in the per-snippet ranking are
# broken; the generator is seeded so the resulting ranking is reproducible.
jitter_rng = np.random.default_rng(0)
top_tfidf_plusRand = top_tfidf.copy()
top_tfidf_plusRand['tfidf'] = top_tfidf_plusRand['tfidf'] + jitter_rng.random(top_tfidf.shape[0])*0.0001

In [None]:



# Chart layers for the per-snippet heatmap. Reads `top_tfidf_plusRand` and
# `term_list` from the cell above; nothing is displayed by this cell.

# base for all visualizations, with rank calculation
# (terms are ranked within each snippet by descending tfidf)
base = alt.Chart(top_tfidf_plusRand).encode(
    x = 'rank:O',
    y = 'snippet:N'
).transform_window(
    rank = "rank()",
    sort = [alt.SortField("tfidf", order="descending")],
    groupby = ["snippet"],
)

# heatmap specification
heatmap = base.mark_rect().encode(
    color = 'tfidf:Q'
)

# red circle over terms in above list
# (other terms get '#FFFFFF00', i.e. a colour with zero alpha)
circle = base.mark_circle(size=100).encode(
    color = alt.condition(
        alt.FieldOneOfPredicate(field='term', oneOf=term_list),
        alt.value('red'),
        alt.value('#FFFFFF00')        
    )
)

# text labels, white for darker heatmap colors
# (0.23 is the hard-coded tfidf threshold for switching the label colour)
text = base.mark_text(baseline='middle').encode(
    text = 'term:N',
    color = alt.condition(alt.datum.tfidf >= 0.23, alt.value('white'), alt.value('black'))
)



In [None]:
# Inspect the jittered table that feeds the chart.
top_tfidf_plusRand


In [None]:
# display the heatmap with the term labels on top
# (only two layers are combined here; the `circle` layer is left out)
(heatmap + text).properties(width = 1000)

In [None]:
# The module-level remove_stopwords() helper is only defined further down, so on
# a fresh kernel calling it here raised NameError. Use the TFIDF method defined
# above instead; it applies the same cleaning steps (tokenizing with
# nltk.word_tokenize).
TFIDF(data[:1]).remove_stopwords(data[0])

In [None]:
# Render the red-dot layer on its own.
circle

In [None]:
# Render the term-label layer on its own.
text

## Testing gensim

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

        
# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session
import gensim
from gensim.models import Word2Vec
import numpy as np
import nltk
import itertools
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize
import scipy
from scipy import spatial
from nltk.tokenize.toktok import ToktokTokenizer
import re
# Module-level tokenizer and English stop-word list; both are read as globals by
# remove_stopwords() below.
tokenizer = ToktokTokenizer()
stopword_list = nltk.corpus.stopwords.words('english') 

In [None]:
# Show the stop words that will be filtered out.
stopword_list

In [None]:
def remove_stopwords(text, is_lower_case=False):
    """Replace newlines/tabs with spaces, tokenize, and drop English stop words.

    Uses the module-level ``tokenizer`` and ``stopword_list``.

    Args:
        text: A string, or an iterable of strings that is concatenated first.
        is_lower_case: True when ``text`` is already lower-cased, so tokens are
            compared to the stop-word list as-is; otherwise each token is
            lower-cased for the comparison only (its original case is kept).

    Returns:
        The remaining tokens joined by single spaces.
    """
    # A set makes the per-token membership test O(1) instead of scanning the list.
    stopword_lookup = set(stopword_list)

    text = re.sub(r'\n|\t', " ", ''.join(text))
    tokens = [tok.strip() for tok in tokenizer.tokenize(text)]
    if is_lower_case:
        # This branch used to assign a misspelled name, which made the join below
        # fail with NameError whenever is_lower_case=True.
        cleaned_tokens = [tok for tok in tokens if tok not in stopword_lookup]
    else:
        cleaned_tokens = [tok for tok in tokens if tok.lower() not in stopword_lookup]
    return ' '.join(cleaned_tokens)

In [None]:
# Try the helper on the first corpus entry.
remove_stopwords(data[0])


In [None]:
# Map the first whitespace-separated token of each entry to the array of its
# remaining tokens.
# NOTE(review): the loop reads `data[:100]` (the corpus list), not a GloVe
# embeddings file — confirm the intended input.
glove_vectors = {}

for line in data[:100]:
    values = line.split()
    if not values:
        # Empty or whitespace-only entries have no first token; skip them instead
        # of failing with IndexError.
        continue
    word, *components = values
    glove_vectors[word] = np.asarray(components)
