In [None]:
import os

from nlp_ai_utils import *
from chunks_urls import CHUNKS_URLS

In [None]:
# Notebook-wide configuration.

# TensorFlow reads TF_ENABLE_ONEDNN_OPTS from the process *environment*; a plain
# Python variable with that name has no effect on it. The variable is kept for
# backward compatibility and mirrored into the environment.
# NOTE(review): the setting is only honoured if it is exported before TensorFlow
# is first imported - confirm whether nlp_ai_utils imports TensorFlow; if it
# does, this export has to move above that import.
TF_ENABLE_ONEDNN_OPTS = 0
os.environ["TF_ENABLE_ONEDNN_OPTS"] = str(TF_ENABLE_ONEDNN_OPTS)

# Chunk download URLs (alias of the imported CHUNKS_URLS).
URLS = CHUNKS_URLS
# Number of chunk files whose names are generated and read further down.
LIMIT = 60

# Location and naming scheme of the chunk files: <path>/chunk_<n>.csv
LARGER_DATASET_PATH = "../larger_dataset"
PREPROCESSED_CHUNKS_PATH = LARGER_DATASET_PATH + "/preprocessed_data_chunks/"
BASE_FILE_NAME = "chunk_"
FILE_FORMAT = ".csv"

# presumably the row count of the complete dataset - TODO confirm
ACTUAL_DATA_SHAPE = 6990280
# Random seed constant for reproducible runs.
RANDOM_STATE = 42

In [None]:
def get_chunks(urls,limit=0,verbose = 1,base_name = "",file_path="",file_format='.csv'):
    """Download every URL in ``urls`` to ``file_path + base_name + <n> + file_format``.

    Files are numbered from 1 in the order of ``urls``. A file that is already
    present on disk is reported and skipped, so the function can be re-run to
    resume an interrupted download.

    Parameters
    ----------
    urls : iterable of str
        Download locations, one per chunk.
    limit : int
        Stop after this many URLs have been considered; ``0`` means no limit.
    verbose : int
        Print a progress line for every ``verbose``-th URL; ``0`` silences the
        progress lines (the "already exists" notices are still printed).
    base_name, file_path, file_format : str
        Pieces that are concatenated into the destination path.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status. Nothing is written to the
        final destination in that case.
    """
    for i,url in enumerate(urls):
        if limit and i == limit:
            return
        file_name = base_name + str(i + 1)
        destination = file_path + file_name + file_format
        #checks if file already exists
        if os.path.exists(destination):
            print(f"{file_name} already exists.")
            continue
        # `verbose and ...` avoids a ZeroDivisionError for verbose=0
        if verbose and i % verbose == 0:
            print(f"Downloading {file_name}...")
        # Download into a temporary name first: an interrupted transfer must not
        # leave a truncated file that the exists() check above would later skip.
        partial = destination + ".part"
        with requests.get(url, stream=True, timeout=60) as r:
            # fail loudly instead of saving an HTML error page as a CSV
            r.raise_for_status()
            with open(partial, 'wb') as fd:
                # iter_content() defaults to 1-byte chunks; use a sane block size
                for chunk in r.iter_content(chunk_size=1 << 16):
                    fd.write(chunk)
        os.replace(partial, destination)

In [None]:
def get_all_file_names(base_name,limit_num):
    """Return the numbered names ``base_name + "1"`` up to ``base_name + str(limit_num)``.

    The names carry no directory and no extension; an empty list is returned
    when ``limit_num`` is smaller than 1.
    """
    names = []
    for offset in range(limit_num):
        names.append(base_name + str(offset + 1))
    return names

In [None]:
def read_chunks(files,file_path = "",file_format = ".csv"):
    """Read each chunk CSV into its own DataFrame.

    Every entry of ``files`` is expanded to ``file_path + name + file_format``
    and loaded with ``pd.read_csv``.

    Returns a list of DataFrames in the same order as ``files`` (the frames are
    not concatenated here).
    """
    return [pd.read_csv(file_path + chunk_name + file_format) for chunk_name in files]

In [None]:
# Names (without folder or extension) of the first LIMIT chunk files
all_file_names = get_all_file_names(base_name=BASE_FILE_NAME, limit_num=LIMIT)

In [None]:
# Load each chunk CSV from the preprocessed folder; one DataFrame per file
list_dfs = read_chunks(
    files=all_file_names,
    file_path=PREPROCESSED_CHUNKS_PATH,
    file_format=FILE_FORMAT,
)

In [None]:
def group_up_chunks(dfs):
    """Stack the given DataFrames on top of each other and return the result.

    Each frame keeps its own row labels, so the combined index can contain
    duplicates until it is reset by the caller.
    """
    combined = pd.concat(dfs, axis=0)
    return combined

In [None]:
# Merge the per-chunk frames into one single DataFrame
df = group_up_chunks(dfs=list_dfs)

In [None]:
# Keep only the review text and its star rating; copy so later edits leave df untouched
review_data = df.loc[:, ['text', 'stars']].copy()

In [None]:
# The concatenated frame still carries each chunk's own row labels;
# replace them with one continuous 0..n-1 index and discard the old labels.
review_data.reset_index(drop=True, inplace=True)

In [None]:
# Give the two columns more descriptive names
review_data = review_data.rename(
    columns={
        'text': 'full_review_text',
        'stars': 'star_rating',
    }
)

In [None]:
# Count the missing values per column before cleaning (shown as the cell output).
null_counts = review_data.isnull().sum()

# dropna() returns a new frame and leaves the original untouched, so the result
# has to be assigned back - otherwise the rows with missing values stay in
# review_data. The index is reset so it stays continuous after rows are removed.
review_data = review_data.dropna().reset_index(drop=True)

null_counts

In [None]:
# Features (review text) and target (star rating)
X, y = review_data['full_review_text'], review_data['star_rating']

In [None]:
# Make sure the folder for the cached pickle/model files exists.
# exist_ok=True keeps the cell safe to re-run and avoids the check-then-create
# race of a separate exists() test followed by mkdir().
os.makedirs('../pickle_files', exist_ok=True)

In [None]:
# Build the bag-of-words count matrix once and cache it on disk.
# NOTE: despite its name, `word2vec_model_sklearn` holds the document-term matrix
# returned by CountVectorizer.fit_transform, not a word2vec model.
# NOTE(review): only the matrix is cached - the fitted `count_model` (and with it
# the vocabulary) is not persisted and is not defined when the cache is hit.
pickle_path = "../pickle_files/word2vec_model_sklearn.pickle"
if not os.path.exists(pickle_path):
    print("Creating Embedding From Scratch.")
    count_model = CountVectorizer()
    # every entry is cast to a string so non-string cells do not break the vectorizer
    word2vec_model_sklearn = count_model.fit_transform(X.apply(lambda x: np.str_(x)))
    # `with` guarantees the file is flushed and closed even if dump() fails
    with open(pickle_path, 'wb') as pickle_out:
        pickle.dump(word2vec_model_sklearn, pickle_out)
else:
    print("Found Pickle File.")
    # SECURITY: unpickling can execute arbitrary code - only load cache files
    # that were produced by this notebook.
    with open(pickle_path, 'rb') as pickle_in:
        word2vec_model_sklearn = pickle.load(pickle_in)

In [None]:
class TokenizedReviews:
    """Restartable iterable that yields every review as a list of word tokens.

    Word2Vec walks over its corpus several times (vocabulary scan plus each
    training epoch), so a one-shot generator is not enough. Tokenizing lazily on
    every pass avoids keeping all token lists in memory at once.
    """

    def __init__(self, texts):
        self.texts = texts

    def __iter__(self):
        for text in self.texts:
            # str() guards against non-string cells; lower-casing lets "Good"
            # and "good" share one vector. Simple whitespace tokenization -
            # swap in a richer tokenizer here if punctuation handling matters.
            yield str(text).lower().split()


# gensim expects each "sentence" to be a list of tokens. Handing it the raw Series
# of strings makes it iterate every review character by character, which trains
# character vectors instead of word vectors.
# seed=RANDOM_STATE fixes the initialisation; per the gensim docs a fully
# repeatable run additionally needs workers=1 and a fixed PYTHONHASHSEED.
word2vec_model_gensim = Word2Vec(
    TokenizedReviews(X),
    min_count=1,
    vector_size=100,
    seed=RANDOM_STATE,
)
word2vec_model_gensim.save('../pickle_files/word2vec_model_gensim.model')