In [None]:
from gensim.models import Word2Vec
from sklearn.model_selection import train_test_split
from nltk.tokenize import word_tokenize
import nltk
import pandas as pd
from tqdm import tqdm
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import numpy as np
import openpyxl
import os

In [None]:
import pandas as pd
import openpyxl
def xlsx_to_csv_pd(path_xls):
    """Convert an Excel workbook to a UTF-8 CSV file stored next to it.

    The workbook is read with its first column as the index, and written to
    the same location with the file extension replaced by ``.csv``.

    Args:
        path_xls: Path of the Excel file to read.

    Returns:
        The path of the CSV file that was written.
    """
    # Local import keeps this helper usable when only this cell has been run.
    import os

    # splitext only strips a real file extension; splitting on the last '.'
    # of the whole path would cut inside a dotted directory name when the
    # file itself has no extension (e.g. 'run.v1/book').
    base, _extension = os.path.splitext(path_xls)
    path_csv = base + '.csv'
    data_xls = pd.read_excel(path_xls, index_col=0)
    data_xls.to_csv(path_csv, encoding='utf-8')
    return path_csv

In [None]:
# Input table (a CSV file, despite the variable name). Uncomment exactly one path.
# all
path_xls = 'data/Classifer_BaseOnNPM/word/npm_all.csv'
# static
# path_xls = 'data/Classifer_BaseOnNPM/word/npm_static_except_noget.csv'

# path_csv = xlsx_to_csv_pd(path_xls)
# Every column is read as text; empty cells come back as NaN.
df = pd.read_csv(path_xls, dtype=str)

# Column to embed in the cells below; swap the column name to embed another one.
# 'des' is used as the example here.
data = df['des']

# 80/20 split with a fixed seed; only the training part feeds Word2Vec.
train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)

# Lower-case and tokenize each non-missing training entry.
tokenized_corpus = [
    word_tokenize(str(text).lower())
    for text in tqdm(train_data)
    if pd.notnull(text)
]

In [None]:
# Candidate values for the three Word2Vec hyper-parameters explored below.
vector_sizes = [50, 100, 200]
window_sizes = [5, 10, 15]
min_counts = [1, 5, 10]

# Best-so-far bookkeeping; a configuration is recorded only when its score exceeds 0.0.
best_model = None
best_params = {'vector_size': None, 'window': None, 'min_count': None}
best_similarity = 0.0

# A single Word2Vec instance is constructed here (50 dims, window 5, min_count 1)
# and that same object is trained again on every pass of the loops below.
model = Word2Vec(sentences=tokenized_corpus, vector_size=50, window=5, min_count=1, workers=4)
model.build_vocab(tokenized_corpus)

# NOTE(review): this is not an independent comparison of configurations. Every pass
# continues training the one `model` object, and `best_model = model` stores a reference
# rather than a snapshot, so any recorded `best_model` is simply `model` in its final
# state, whichever `best_params` get printed. Presumably assigning `vector_size` /
# `min_count` on an already-built model also does not resize its vectors or re-prune its
# vocabulary -- TODO confirm against the gensim docs. A true search needs a fresh
# Word2Vec per combination, with every consumer sizing itself from the chosen model.
for vector_size in vector_sizes:
    for window_size in window_sizes:
        for min_count_value in min_counts:
            # Overwrite the hyper-parameter attributes in place, then train 10 more epochs.
            model.vector_size, model.window, model.min_count = vector_size, window_size, min_count_value
            model.train(tokenized_corpus, total_examples=len(tokenized_corpus), epochs=10)

            # Score: mean of the pairwise cosine-similarity matrix over the whole vocabulary.
            word_vectors = model.wv
            similarity_matrix = cosine_similarity([word_vectors[token] for token in word_vectors.index_to_key])
            avg_similarity = similarity_matrix.mean()

            # Only a strictly higher score replaces the current best.
            if not avg_similarity > best_similarity:
                continue
            best_similarity = avg_similarity
            best_model = model
            best_params.update(vector_size=vector_size, window=window_size, min_count=min_count_value)

print("Best Parameters:")
print(best_params)

In [None]:
# Copy the selected model's vocabulary into a plain {token: vector} dict for the lookups below.
word_embeddings = dict((word, best_model.wv[word]) for word in best_model.wv.index_to_key)

def vectorize_text(text, embeddings):
    """Return the embedding of every token of ``text`` that has one.

    ``text`` is lower-cased and tokenized with ``word_tokenize``; tokens that
    are not keys of ``embeddings`` are dropped.

    Args:
        text: String to vectorize.
        embeddings: Mapping from token to vector.

    Returns:
        A list of vectors in token order; empty when no token is known.
    """
    vectors = []
    for token in word_tokenize(text.lower()):
        if token in embeddings:
            vectors.append(embeddings[token])
    return vectors

# Width of the fallback zero vector, read from the model so it always matches
# the real vectors instead of relying on a hard-coded 50.
embedding_dim = best_model.wv.vector_size

all_data_vectorized_des = {}  # text -> averaged vector (non-missing rows only)
con_des = []                  # exactly one vector per row of `data`, in row order
for sentence in tqdm(data):
    if pd.isnull(sentence):
        # A missing value still gets a (zero) row. Skipping it would shift every
        # later row, so con_des would no longer line up with the rows of `df`
        # (and their labels) or with the other per-column feature lists.
        con_des.append(np.zeros(embedding_dim))
        continue

    sentence_vector = vectorize_text(str(sentence), word_embeddings)
    if sentence_vector:
        # Sentence embedding = element-wise mean of its token vectors.
        average_vector = np.mean(sentence_vector, axis=0)
    else:
        # No token of this text is in the vocabulary: fall back to all zeros.
        average_vector = np.zeros(embedding_dim)
    all_data_vectorized_des[sentence] = average_vector
    con_des.append(average_vector)

print(len(con_des))

In [None]:
# Column embedded in this cell; swap the column name to embed another one.
data = df['aut_main']

# 80/20 split with a fixed seed; only the training part feeds Word2Vec.
train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)

# Lower-case and tokenize each non-missing training entry.
tokenized_corpus = [
    word_tokenize(str(text).lower())
    for text in tqdm(train_data)
    if pd.notnull(text)
]

# Candidate values for the three Word2Vec hyper-parameters explored below.
vector_sizes = [50, 100, 200]
window_sizes = [5, 10, 15]
min_counts = [1, 5, 10]

# Best-so-far bookkeeping; a configuration is recorded only when its score exceeds 0.0.
best_model = None
best_params = {'vector_size': None, 'window': None, 'min_count': None}
best_similarity = 0.0

# A single Word2Vec instance (50 dims, window 5, min_count 1) reused by every pass below.
model = Word2Vec(sentences=tokenized_corpus, vector_size=50, window=5, min_count=1, workers=4)
model.build_vocab(tokenized_corpus)

# NOTE(review): this is not an independent comparison of configurations. Every pass
# continues training the one `model` object, and `best_model = model` stores a reference
# rather than a snapshot, so any recorded `best_model` is simply `model` in its final
# state, whichever `best_params` get printed. Presumably assigning `vector_size` /
# `min_count` on an already-built model also does not resize its vectors or re-prune its
# vocabulary -- TODO confirm against the gensim docs. A true search needs a fresh
# Word2Vec per combination, with every consumer sizing itself from the chosen model.
for vector_size in vector_sizes:
    for window_size in window_sizes:
        for min_count_value in min_counts:
            # Overwrite the hyper-parameter attributes in place, then train 10 more epochs.
            model.vector_size, model.window, model.min_count = vector_size, window_size, min_count_value
            model.train(tokenized_corpus, total_examples=len(tokenized_corpus), epochs=10)

            # Score: mean of the pairwise cosine-similarity matrix over the whole vocabulary.
            word_vectors = model.wv
            similarity_matrix = cosine_similarity([word_vectors[token] for token in word_vectors.index_to_key])
            avg_similarity = similarity_matrix.mean()

            # Only a strictly higher score replaces the current best.
            if not avg_similarity > best_similarity:
                continue
            best_similarity = avg_similarity
            best_model = model
            best_params.update(vector_size=vector_size, window=window_size, min_count=min_count_value)

print("Best Parameters:")
print(best_params)

# Copy the selected model's vocabulary into a plain {token: vector} dict.
word_embeddings = dict((word, best_model.wv[word]) for word in best_model.wv.index_to_key)


def vectorize_text(text, embeddings):
    """Return the embedding of every token of ``text`` that has one.

    ``text`` is lower-cased and tokenized with ``word_tokenize``; tokens that
    are not keys of ``embeddings`` are dropped.

    Args:
        text: String to vectorize.
        embeddings: Mapping from token to vector.

    Returns:
        A list of vectors in token order; empty when no token is known.
    """
    vectors = []
    for token in word_tokenize(text.lower()):
        if token in embeddings:
            vectors.append(embeddings[token])
    return vectors


# Width of the fallback zero vector, read from the model so it always matches
# the real vectors instead of relying on a hard-coded 50.
embedding_dim = best_model.wv.vector_size

all_data_vectorized_aumain = {}  # text -> averaged vector (non-missing rows only)
con_aumain = []                  # exactly one vector per row of `data`, in row order
for sentence in tqdm(data):
    if pd.isnull(sentence):
        # A missing value still gets a (zero) row. Skipping it would shift every
        # later row, so con_aumain would no longer line up with the rows of `df`
        # (and their labels) or with the other per-column feature lists.
        con_aumain.append(np.zeros(embedding_dim))
        continue

    sentence_vector = vectorize_text(str(sentence), word_embeddings)
    if sentence_vector:
        # Sentence embedding = element-wise mean of its token vectors.
        average_vector = np.mean(sentence_vector, axis=0)
    else:
        # No token of this text is in the vocabulary: fall back to all zeros.
        average_vector = np.zeros(embedding_dim)
    all_data_vectorized_aumain[sentence] = average_vector
    con_aumain.append(average_vector)

print(len(con_aumain))

In [None]:
# Column embedded in this cell; swap the column name to embed another one.
data = df['url_git']

# 80/20 split with a fixed seed; only the training part feeds Word2Vec.
train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)

# Lower-case and tokenize each non-missing training entry.
tokenized_corpus = [
    word_tokenize(str(text).lower())
    for text in tqdm(train_data)
    if pd.notnull(text)
]

# Candidate values for the three Word2Vec hyper-parameters explored below.
vector_sizes = [50, 100, 200]
window_sizes = [5, 10, 15]
min_counts = [1, 5, 10]

# Best-so-far bookkeeping; a configuration is recorded only when its score exceeds 0.0.
best_model = None
best_params = {'vector_size': None, 'window': None, 'min_count': None}
best_similarity = 0.0

# A single Word2Vec instance (50 dims, window 5, min_count 1) reused by every pass below.
model = Word2Vec(sentences=tokenized_corpus, vector_size=50, window=5, min_count=1, workers=4)
model.build_vocab(tokenized_corpus)

# NOTE(review): this is not an independent comparison of configurations. Every pass
# continues training the one `model` object, and `best_model = model` stores a reference
# rather than a snapshot, so any recorded `best_model` is simply `model` in its final
# state, whichever `best_params` get printed. Presumably assigning `vector_size` /
# `min_count` on an already-built model also does not resize its vectors or re-prune its
# vocabulary -- TODO confirm against the gensim docs. A true search needs a fresh
# Word2Vec per combination, with every consumer sizing itself from the chosen model.
for vector_size in vector_sizes:
    for window_size in window_sizes:
        for min_count_value in min_counts:
            # Overwrite the hyper-parameter attributes in place, then train 10 more epochs.
            model.vector_size, model.window, model.min_count = vector_size, window_size, min_count_value
            model.train(tokenized_corpus, total_examples=len(tokenized_corpus), epochs=10)

            # Score: mean of the pairwise cosine-similarity matrix over the whole vocabulary.
            word_vectors = model.wv
            similarity_matrix = cosine_similarity([word_vectors[token] for token in word_vectors.index_to_key])
            avg_similarity = similarity_matrix.mean()

            # Only a strictly higher score replaces the current best.
            if not avg_similarity > best_similarity:
                continue
            best_similarity = avg_similarity
            best_model = model
            best_params.update(vector_size=vector_size, window=window_size, min_count=min_count_value)

print("Best Parameters:")
print(best_params)

# Copy the selected model's vocabulary into a plain {token: vector} dict.
word_embeddings = dict((word, best_model.wv[word]) for word in best_model.wv.index_to_key)
def vectorize_text(text, embeddings):
    """Return the embedding of every token of ``text`` that has one.

    ``text`` is lower-cased and tokenized with ``word_tokenize``; tokens that
    are not keys of ``embeddings`` are dropped.

    Args:
        text: String to vectorize.
        embeddings: Mapping from token to vector.

    Returns:
        A list of vectors in token order; empty when no token is known.
    """
    vectors = []
    for token in word_tokenize(text.lower()):
        if token in embeddings:
            vectors.append(embeddings[token])
    return vectors


# Width of the fallback zero vector, read from the model so it always matches
# the real vectors instead of relying on a hard-coded 50.
embedding_dim = best_model.wv.vector_size

all_data_vectorized_url = {}  # text -> averaged vector (non-missing rows only)
con_url = []                  # exactly one vector per row of `data`, in row order
for sentence in tqdm(data):
    if pd.isnull(sentence):
        # A missing value still gets a (zero) row. Skipping it would shift every
        # later row, so con_url would no longer line up with the rows of `df`
        # (and their labels) or with the other per-column feature lists.
        con_url.append(np.zeros(embedding_dim))
        continue

    # Vectorize this row's own text. Without this call the loop would reuse
    # whatever `sentence_vector` an earlier cell left behind (or fail with a
    # NameError on a fresh kernel).
    sentence_vector = vectorize_text(str(sentence), word_embeddings)
    if sentence_vector:
        # Sentence embedding = element-wise mean of its token vectors.
        average_vector = np.mean(sentence_vector, axis=0)
    else:
        # No token of this text is in the vocabulary: fall back to all zeros.
        average_vector = np.zeros(embedding_dim)
    all_data_vectorized_url[sentence] = average_vector
    con_url.append(average_vector)
print(len(con_url))

In [None]:

# Column embedded in this cell; swap the column name to embed another one.
data = df['dep_num']

# 80/20 split with a fixed seed; only the training part feeds Word2Vec.
train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)

# Lower-case and tokenize each non-missing training entry.
tokenized_corpus = [
    word_tokenize(str(text).lower())
    for text in tqdm(train_data)
    if pd.notnull(text)
]

# Candidate values for the three Word2Vec hyper-parameters explored below.
vector_sizes = [50, 100, 200]
window_sizes = [5, 10, 15]
min_counts = [1, 5, 10]

# Best-so-far bookkeeping; a configuration is recorded only when its score exceeds 0.0.
best_model = None
best_params = {'vector_size': None, 'window': None, 'min_count': None}
best_similarity = 0.0

# A single Word2Vec instance (50 dims, window 5, min_count 1) reused by every pass below.
model = Word2Vec(sentences=tokenized_corpus, vector_size=50, window=5, min_count=1, workers=4)
model.build_vocab(tokenized_corpus)

# NOTE(review): this is not an independent comparison of configurations. Every pass
# continues training the one `model` object, and `best_model = model` stores a reference
# rather than a snapshot, so any recorded `best_model` is simply `model` in its final
# state, whichever `best_params` get printed. Presumably assigning `vector_size` /
# `min_count` on an already-built model also does not resize its vectors or re-prune its
# vocabulary -- TODO confirm against the gensim docs. A true search needs a fresh
# Word2Vec per combination, with every consumer sizing itself from the chosen model.
for vector_size in vector_sizes:
    for window_size in window_sizes:
        for min_count_value in min_counts:
            # Overwrite the hyper-parameter attributes in place, then train 10 more epochs.
            model.vector_size, model.window, model.min_count = vector_size, window_size, min_count_value
            model.train(tokenized_corpus, total_examples=len(tokenized_corpus), epochs=10)

            # Score: mean of the pairwise cosine-similarity matrix over the whole vocabulary.
            word_vectors = model.wv
            similarity_matrix = cosine_similarity([word_vectors[token] for token in word_vectors.index_to_key])
            avg_similarity = similarity_matrix.mean()

            # Only a strictly higher score replaces the current best.
            if not avg_similarity > best_similarity:
                continue
            best_similarity = avg_similarity
            best_model = model
            best_params.update(vector_size=vector_size, window=window_size, min_count=min_count_value)

print("Best Parameters:")
print(best_params)

# Copy the selected model's vocabulary into a plain {token: vector} dict.
word_embeddings = dict((word, best_model.wv[word]) for word in best_model.wv.index_to_key)


def vectorize_text(text, embeddings):
    """Return the embedding of every token of ``text`` that has one.

    ``text`` is lower-cased and tokenized with ``word_tokenize``; tokens that
    are not keys of ``embeddings`` are dropped.

    Args:
        text: String to vectorize.
        embeddings: Mapping from token to vector.

    Returns:
        A list of vectors in token order; empty when no token is known.
    """
    vectors = []
    for token in word_tokenize(text.lower()):
        if token in embeddings:
            vectors.append(embeddings[token])
    return vectors


# Width of the fallback zero vector, read from the model so it always matches
# the real vectors instead of relying on a hard-coded 50.
embedding_dim = best_model.wv.vector_size

all_data_vectorized_dep = {}  # text -> averaged vector (non-missing rows only)
con_dep = []                  # exactly one vector per row of `data`, in row order
for sentence in tqdm(data):
    if pd.isnull(sentence):
        # A missing value still gets a (zero) row. Skipping it would shift every
        # later row, so con_dep would no longer line up with the rows of `df`
        # (and their labels) or with the other per-column feature lists.
        con_dep.append(np.zeros(embedding_dim))
        continue

    sentence_vector = vectorize_text(str(sentence), word_embeddings)
    if sentence_vector:
        # Sentence embedding = element-wise mean of its token vectors.
        average_vector = np.mean(sentence_vector, axis=0)
    else:
        # No token of this text is in the vocabulary: fall back to all zeros.
        average_vector = np.zeros(embedding_dim)
    all_data_vectorized_dep[sentence] = average_vector
    con_dep.append(average_vector)
print(len(con_dep))


In [None]:

# Column embedded in this cell; swap the column name to embed another one.
data = df['static_APIs']

# 80/20 split with a fixed seed; only the training part feeds Word2Vec.
train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)

# Lower-case and tokenize each non-missing training entry.
tokenized_corpus = [
    word_tokenize(str(text).lower())
    for text in tqdm(train_data)
    if pd.notnull(text)
]

# Candidate values for the three Word2Vec hyper-parameters explored below.
vector_sizes = [50, 100, 200]
window_sizes = [5, 10, 15]
min_counts = [1, 5, 10]

# Best-so-far bookkeeping; a configuration is recorded only when its score exceeds 0.0.
best_model = None
best_params = {'vector_size': None, 'window': None, 'min_count': None}
best_similarity = 0.0

# A single Word2Vec instance (50 dims, window 5, min_count 1) reused by every pass below.
model = Word2Vec(sentences=tokenized_corpus, vector_size=50, window=5, min_count=1, workers=4)
model.build_vocab(tokenized_corpus)

# NOTE(review): this is not an independent comparison of configurations. Every pass
# continues training the one `model` object, and `best_model = model` stores a reference
# rather than a snapshot, so any recorded `best_model` is simply `model` in its final
# state, whichever `best_params` get printed. Presumably assigning `vector_size` /
# `min_count` on an already-built model also does not resize its vectors or re-prune its
# vocabulary -- TODO confirm against the gensim docs. A true search needs a fresh
# Word2Vec per combination, with every consumer sizing itself from the chosen model.
for vector_size in vector_sizes:
    for window_size in window_sizes:
        for min_count_value in min_counts:
            # Overwrite the hyper-parameter attributes in place, then train 10 more epochs.
            model.vector_size, model.window, model.min_count = vector_size, window_size, min_count_value
            model.train(tokenized_corpus, total_examples=len(tokenized_corpus), epochs=10)

            # Score: mean of the pairwise cosine-similarity matrix over the whole vocabulary.
            word_vectors = model.wv
            similarity_matrix = cosine_similarity([word_vectors[token] for token in word_vectors.index_to_key])
            avg_similarity = similarity_matrix.mean()

            # Only a strictly higher score replaces the current best.
            if not avg_similarity > best_similarity:
                continue
            best_similarity = avg_similarity
            best_model = model
            best_params.update(vector_size=vector_size, window=window_size, min_count=min_count_value)

print("Best Parameters:")
print(best_params)

# Copy the selected model's vocabulary into a plain {token: vector} dict.
word_embeddings = dict((word, best_model.wv[word]) for word in best_model.wv.index_to_key)


def vectorize_text(text, embeddings):
    """Return the embedding of every token of ``text`` that has one.

    ``text`` is lower-cased and tokenized with ``word_tokenize``; tokens that
    are not keys of ``embeddings`` are dropped.

    Args:
        text: String to vectorize.
        embeddings: Mapping from token to vector.

    Returns:
        A list of vectors in token order; empty when no token is known.
    """
    vectors = []
    for token in word_tokenize(text.lower()):
        if token in embeddings:
            vectors.append(embeddings[token])
    return vectors


# Width of the fallback zero vector, read from the model so it always matches
# the real vectors instead of relying on a hard-coded 50.
embedding_dim = best_model.wv.vector_size

all_data_vectorized_staticAPI = {}  # text -> averaged vector (non-missing rows only)
con_static = []                     # exactly one vector per row of `data`, in row order
for sentence in tqdm(data):
    if pd.isnull(sentence):
        # A missing value still gets a (zero) row. Skipping it would shift every
        # later row, so con_static would no longer line up with the rows of `df`
        # (and their labels) or with the other per-column feature lists.
        con_static.append(np.zeros(embedding_dim))
        continue

    sentence_vector = vectorize_text(str(sentence), word_embeddings)
    if sentence_vector:
        # Sentence embedding = element-wise mean of its token vectors.
        average_vector = np.mean(sentence_vector, axis=0)
    else:
        # No token of this text is in the vocabulary: fall back to all zeros.
        average_vector = np.zeros(embedding_dim)
    all_data_vectorized_staticAPI[sentence] = average_vector
    con_static.append(average_vector)
print(len(con_static))


In [None]:

# Column embedded in this cell; swap the column name to embed another one.
data = df['Dynamic_APIs']

# 80/20 split with a fixed seed; only the training part feeds Word2Vec.
train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)

# Lower-case and tokenize each non-missing training entry.
tokenized_corpus = [
    word_tokenize(str(text).lower())
    for text in tqdm(train_data)
    if pd.notnull(text)
]

# Candidate values for the three Word2Vec hyper-parameters explored below.
vector_sizes = [50, 100, 200]
window_sizes = [5, 10, 15]
min_counts = [1, 5, 10]

# Best-so-far bookkeeping; a configuration is recorded only when its score exceeds 0.0.
best_model = None
best_params = {'vector_size': None, 'window': None, 'min_count': None}
best_similarity = 0.0

# A single Word2Vec instance (50 dims, window 5, min_count 1) reused by every pass below.
model = Word2Vec(sentences=tokenized_corpus, vector_size=50, window=5, min_count=1, workers=4)
model.build_vocab(tokenized_corpus)

# NOTE(review): this is not an independent comparison of configurations. Every pass
# continues training the one `model` object, and `best_model = model` stores a reference
# rather than a snapshot, so any recorded `best_model` is simply `model` in its final
# state, whichever `best_params` get printed. Presumably assigning `vector_size` /
# `min_count` on an already-built model also does not resize its vectors or re-prune its
# vocabulary -- TODO confirm against the gensim docs. A true search needs a fresh
# Word2Vec per combination, with every consumer sizing itself from the chosen model.
for vector_size in vector_sizes:
    for window_size in window_sizes:
        for min_count_value in min_counts:
            # Overwrite the hyper-parameter attributes in place, then train 10 more epochs.
            model.vector_size, model.window, model.min_count = vector_size, window_size, min_count_value
            model.train(tokenized_corpus, total_examples=len(tokenized_corpus), epochs=10)

            # Score: mean of the pairwise cosine-similarity matrix over the whole vocabulary.
            word_vectors = model.wv
            similarity_matrix = cosine_similarity([word_vectors[token] for token in word_vectors.index_to_key])
            avg_similarity = similarity_matrix.mean()

            # Only a strictly higher score replaces the current best.
            if not avg_similarity > best_similarity:
                continue
            best_similarity = avg_similarity
            best_model = model
            best_params.update(vector_size=vector_size, window=window_size, min_count=min_count_value)

print("Best Parameters:")
print(best_params)

# Copy the selected model's vocabulary into a plain {token: vector} dict.
word_embeddings = dict((word, best_model.wv[word]) for word in best_model.wv.index_to_key)


def vectorize_text(text, embeddings):
    """Return the embedding of every token of ``text`` that has one.

    ``text`` is lower-cased and tokenized with ``word_tokenize``; tokens that
    are not keys of ``embeddings`` are dropped.

    Args:
        text: String to vectorize.
        embeddings: Mapping from token to vector.

    Returns:
        A list of vectors in token order; empty when no token is known.
    """
    vectors = []
    for token in word_tokenize(text.lower()):
        if token in embeddings:
            vectors.append(embeddings[token])
    return vectors


# Width of the fallback zero vector, read from the model so it always matches
# the real vectors instead of relying on a hard-coded 50.
embedding_dim = best_model.wv.vector_size

all_data_vectorized_DynaAPI = {}  # text -> averaged vector (non-missing rows only)
con_Dynamic = []                  # exactly one vector per row of `data`, in row order
for sentence in tqdm(data):
    if pd.isnull(sentence):
        # A missing value still gets a (zero) row. Skipping it would shift every
        # later row, so con_Dynamic would no longer line up with the rows of `df`
        # (and their labels) or with the other per-column feature lists.
        con_Dynamic.append(np.zeros(embedding_dim))
        continue

    sentence_vector = vectorize_text(str(sentence), word_embeddings)
    if sentence_vector:
        # Sentence embedding = element-wise mean of its token vectors.
        average_vector = np.mean(sentence_vector, axis=0)
    else:
        # No token of this text is in the vocabulary: fall back to all zeros.
        average_vector = np.zeros(embedding_dim)
    all_data_vectorized_DynaAPI[sentence] = average_vector
    con_Dynamic.append(average_vector)

print(len(con_Dynamic))


In [None]:
# Per-column feature lists to join, in output order. Edit this one list to
# build another variant:
##   all      -> [con_des, con_aumain, con_url, con_dep, con_static, con_Dynamic]
##   metadata -> [con_des, con_aumain, con_url, con_dep]
##   static   -> [con_static]
##   dynamic  -> [con_Dynamic]
feature_groups = [con_des, con_aumain, con_url, con_dep, con_static, con_Dynamic]

# zip() stops at the shortest list, so unequal lengths would silently drop rows
# and pair vectors that belong to different records. Fail loudly instead.
group_lengths = [len(group) for group in feature_groups]
if len(set(group_lengths)) > 1:
    raise ValueError(
        "feature lists have different lengths %r; every list needs one vector per row"
        % (group_lengths,)
    )

con_all = []
for parts in zip(*feature_groups):
    # Horizontal connection: join this row's vectors end to end. Passing a list
    # of arrays also works when only a single group is selected.
    concatenated_horizontal = np.concatenate([np.asarray(part) for part in parts]).tolist()
    # Each row is stored as a one-element list wrapping the flat feature list.
    con_all.append([concatenated_horizontal])
print(len(con_all))

In [None]:
# Output file for the concatenated features. Uncomment exactly one path so it
# matches the variant that was concatenated.
# 4 kinds
csv_file_path = 'data/Classifer_BaseOnNPM/embedding/pm_npm/all.csv'  
# csv_file_path = 'data/Classifer_BaseOnNPM/embedding/pm_npm/dynamic.csv'  
# csv_file_path = 'data/Classifer_BaseOnNPM/embedding/pm_npm/meta4.csv'  
# csv_file_path = 'data/Classifer_BaseOnNPM/embedding/pm_npm/static.csv'  

# Static without 'no get'
# csv_file_path = 'data/Classifer_BaseOnNPM/embedding/pm_static_noget/npm.csv'  

# One label per input row; the feature rows must pair with them one-to-one,
# otherwise features would be written next to the wrong label.
labels = df['label'].tolist()
if len(con_all) != len(labels):
    raise ValueError(
        "feature/label count mismatch: %d feature rows vs %d labels"
        % (len(con_all), len(labels))
    )

# Build the table column-wise (one 'feature' entry and one 'label' per row) and
# write it as plain CSV text. An Excel workbook must not be saved under this
# name: it is a binary container that read_csv cannot parse. to_csv replaces
# any file already at the path, so no explicit delete is needed.
df_new = pd.DataFrame({'feature': con_all, 'label': labels})
df_new.to_csv(csv_file_path, index=False)