In [None]:
from nlp_ai_utils import *
from chunks_urls import CHUNKS_URLS

c:\Users\user\AppData\Local\Programs\Python\Python310\lib\site-packages\numpy\.libs\libopenblas.EL2C6PLE4ZYW3ECEVIV3OXXGRN2NRFM2.gfortran-win_amd64.dll
c:\Users\user\AppData\Local\Programs\Python\Python310\lib\site-packages\numpy\.libs\libopenblas.FB5AE2TYXYH2IJRDKGDGQ3XBKLKTF43H.gfortran-win_amd64.dll


In [None]:
# --- Notebook configuration -------------------------------------------------
# TensorFlow reads TF_ENABLE_ONEDNN_OPTS from the process environment, not from
# a Python variable, so the value is exported as well as kept as a constant.
# NOTE(review): this only takes effect if it runs before TensorFlow is first
# imported; `os` is assumed to come from the star import at the top of the
# notebook, as it does for the other cells — confirm.
TF_ENABLE_ONEDNN_OPTS = 0
os.environ["TF_ENABLE_ONEDNN_OPTS"] = str(TF_ENABLE_ONEDNN_OPTS)
# Download URLs of the dataset chunks (imported from chunks_urls).
URLS = CHUNKS_URLS
# Number of chunk files whose names are generated and read below.
LIMIT = 60
# Location of the preprocessed chunk CSVs, relative to the notebook.
LARGER_DATASET_PATH = "../larger_dataset"
PREPROCESSED_CHUNKS_PATH = LARGER_DATASET_PATH + "/preprocessed_data_chunks/"
# Chunk files are named BASE_FILE_NAME + <1-based index> + FILE_FORMAT.
BASE_FILE_NAME = "chunk_"
FILE_FORMAT = ".csv"
# Presumably the expected total number of review rows — TODO confirm.
ACTUAL_DATA_SHAPE = 6990280
# Seed shared by the resampler and the classifiers.
RANDOM_STATE = 42

In [None]:
def get_chunks(urls,limit=0,verbose = 1,base_name = "",file_path="",file_format='.csv',chunk_size=1 << 16,timeout=60):
    """Download every URL in ``urls`` to a local file, skipping existing files.

    The i-th URL (1-based) is saved as
    ``file_path + base_name + str(i) + file_format``.

    Args:
        urls: Iterable of download URLs.
        limit: Stop after this many URLs; ``0`` means no limit.
        verbose: Print a progress line for every ``verbose``-th URL;
            ``0`` disables progress lines.
        base_name: Prefix of each generated file name.
        file_path: Directory prefix, concatenated as-is (include the
            trailing separator).
        file_format: File extension, including the dot.
        chunk_size: Number of bytes per write while streaming the body.
        timeout: Seconds passed to ``requests.get`` as its timeout.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    for i,url in enumerate(urls):
        if limit and i == limit:
            return
        file_name = base_name + str(i + 1)
        target = file_path + file_name + file_format
        #checks if file already exists
        if os.path.exists(target):
            print(f"{file_name} already exists.")
            continue
        # `verbose and ...` avoids a modulo-by-zero when verbose is 0.
        if verbose and i % verbose == 0:
            print(f"Downloading {file_name}...")
        response = requests.get(url, stream=True, timeout=timeout)
        # Write to a side file first so an interrupted download is never
        # mistaken for a finished chunk by the "already exists" check above.
        partial = target + ".part"
        try:
            # Fail loudly instead of saving an HTTP error page as a CSV.
            response.raise_for_status()
            with open(partial, 'wb') as fd:
                for chunk in response.iter_content(chunk_size=chunk_size):
                    fd.write(chunk)
        finally:
            response.close()
        os.replace(partial, target)

In [None]:
def get_all_file_names(base_name,limit_num):
    """Return ``[base_name + "1", ..., base_name + str(limit_num)]``.

    Only builds the names; it does not check that the files exist.
    """
    names = []
    for number in range(1, limit_num + 1):
        names.append(base_name + str(number))
    return names

In [None]:
def read_chunks(files,file_path = "",file_format = ".csv"):
    """Read each named CSV into a DataFrame and return them as a list.

    Every path is built as ``file_path + name + file_format``; the returned
    list has one DataFrame per entry of ``files``, in the same order.
    """
    return [pd.read_csv(file_path + name + file_format) for name in files]

In [None]:
# Build the expected chunk file stems "chunk_1" ... "chunk_<LIMIT>".
# This only generates names; it does not verify the files were downloaded.
all_file_names = get_all_file_names(BASE_FILE_NAME,LIMIT)

In [None]:
# Read every chunk CSV from PREPROCESSED_CHUNKS_PATH into a list of
# DataFrames (one per file, in file-name order).
list_dfs = read_chunks(all_file_names,PREPROCESSED_CHUNKS_PATH,FILE_FORMAT)

In [None]:
def group_up_chunks(dfs):
    """Stack the given DataFrames row-wise into one DataFrame.

    Each frame keeps its own index labels, so the combined index can
    contain duplicates.
    """
    combined = pd.concat(dfs)
    return combined

In [None]:
# Concatenate all chunk DataFrames into a single DataFrame.
# The per-chunk index labels are kept here; the index is rebuilt later on
# the `review_data` copy.
df = group_up_chunks(list_dfs)

In [None]:
# Keep only the review text and its star rating; .copy() gives an
# independent frame so later in-place edits do not touch `df`.
review_data = df[['text', 'stars']].copy()

In [None]:
# Replace the concatenated (repeating) index with a fresh 0..n-1 index,
# discarding the old labels instead of keeping them as a column.
review_data.reset_index(drop=True, inplace=True)

In [None]:
# Give the two kept columns more descriptive names.
review_data.rename(
    columns={
        'text': 'full_review_text',
        'stars': 'star_rating',
    },
    inplace=True,
)

In [None]:
# Per-column null counts. This is not the cell's last expression, so the
# result is computed and thrown away without being displayed.
review_data.isnull().sum()
# NOTE(review): dropna() returns a new frame that is only displayed here;
# `review_data` itself is not modified, so rows with missing text remain in
# it for every later cell.
review_data.dropna()

Unnamed: 0,full_review_text,star_rating
0,decide eat aware go take 2 hour begin end try ...,3.0
1,ve take lot spin class year nothing compare cl...,5.0
2,family diner buffet eclectic assortment large ...,3.0
3,wow yummy different delicious favorite lamb cu...,5.0
4,cute interior owner give u tour upcoming patio...,4.0
...,...,...
6990275,late addition service iccu apple pay iccu debi...,5.0
6990276,spot offer great affordable east weekend paddl...,5.0
6990277,home depot need get lot demential lumber seem ...,4.0
6990278,m feel like ignore caloriecounting indulge fla...,5.0


In [None]:
# Features (review text) and target (star rating) for the models below.
X, y = review_data['full_review_text'], review_data['star_rating']

In [None]:
# Force every review to a Python str (a missing value becomes the text
# "nan") and rebuild X as a Series with a fresh default index.
X = pd.Series(list(map(str, X)))

In [None]:
# Show how many missing values each column of review_data still has.
review_data.isnull().sum()

full_review_text    44
star_rating          0
dtype: int64

In [None]:
# NOTE(review): this only displays a copy without the null rows; the result
# is not assigned, so `review_data` keeps its missing-text rows.
review_data.dropna()

Unnamed: 0,full_review_text,star_rating
0,decide eat aware go take 2 hour begin end try ...,3.0
1,ve take lot spin class year nothing compare cl...,5.0
2,family diner buffet eclectic assortment large ...,3.0
3,wow yummy different delicious favorite lamb cu...,5.0
4,cute interior owner give u tour upcoming patio...,4.0
...,...,...
6990275,late addition service iccu apple pay iccu debi...,5.0
6990276,spot offer great affordable east weekend paddl...,5.0
6990277,home depot need get lot demential lumber seem ...,4.0
6990278,m feel like ignore caloriecounting indulge fla...,5.0


In [None]:
# Make sure the cache directory for pickled artifacts exists. exist_ok=True
# makes this safe to re-run and removes the check-then-create race.
os.makedirs('../pickle_files', exist_ok=True)

In [None]:
# Vectorizer used to turn the review texts into a document-term matrix.
# Presumably scikit-learn's CountVectorizer with default settings
# (the name comes from the star import) — confirm.
count_model = CountVectorizer()

In [None]:
# Build the vectorized review matrix once and cache it on disk.
# Despite the "word2vec" name, this is the output of count_model.fit_transform.
# NOTE(review): only the transformed matrix is cached. When the cache is hit,
# `count_model` stays unfitted, and the cache is not invalidated if X changes.
word2vec_pickle_path = "../pickle_files/word2vec_model_sklearn.pickle"
if not os.path.exists(word2vec_pickle_path):
    print("Creating Embedding From Scratch.")
    word2vec_model_sklearn = count_model.fit_transform(X.apply(lambda x: np.str_(x)))
    # `with` guarantees the file is closed even if pickling fails.
    with open(word2vec_pickle_path,'wb') as pickle_out:
        pickle.dump(word2vec_model_sklearn,pickle_out)
else:
    print("Found Pickle File.")
    # SECURITY: pickle.load can execute arbitrary code; only load cache files
    # that this notebook produced itself.
    with open(word2vec_pickle_path,'rb') as pickle_in:
        word2vec_model_sklearn = pickle.load(pickle_in)

Found Pickle File.


In [None]:
# Convert the vectorizer output to a dense array.
# NOTE(review): presumably this is a SciPy sparse matrix with one row per
# review and one column per vocabulary term; with millions of reviews a dense
# copy may not fit in memory. Consider keeping it sparse if the helpers and
# models below accept sparse input — confirm.
word2vec_model_sklearn_array = word2vec_model_sklearn.toarray()

In [None]:
# Split features and star ratings, with test_size=0.2.
# NOTE(review): create_train_test_split is a project helper not shown here and
# no seed is passed; confirm it fixes its own random state for reproducibility.
x_train,x_test,y_train,y_test = create_train_test_split(word2vec_model_sklearn_array,y,test_size=0.2)

### Data Resampling

In [None]:
# Resample the training split only; x_test/y_test are left untouched.
# Presumably imbalanced-learn's SMOTE oversampler — confirm.
smote_resampler = SMOTE(random_state=RANDOM_STATE)
# NOTE(review): this overwrites x_train/y_train, so re-running the cell
# resamples the already-resampled data; re-run the split cell first.
x_train,y_train = smote_resampler.fit_resample(x_train,y_train)

### Comparisons

#### Sentiment Polarity

In [None]:
# Rule-based sentiment scorer (presumably NLTK's VADER analyzer — confirm).
sid = SentimentIntensityAnalyzer()

# One score dict per review. review_data still contains missing reviews
# (dropna was never assigned), so each value is coerced with str(), the same
# way X is built, to avoid handing a float NaN to the analyzer.
sent_polarity_info = [sid.polarity_scores(str(review)) for review in review_data['full_review_text']]

# Preview only: displaying the full list would write one dict per review
# into the notebook output.
sent_polarity_info[:5]

In [None]:
# Turn each score dict into a sentiment label and a polarity value using the
# project helpers, then store both as new columns on review_data.
review_sentiment = list(map(classify_sentiment, sent_polarity_info))
sent_polarity = list(map(extract_sent_polarity, sent_polarity_info))

review_data['str_sent'] = review_sentiment
review_data['sent_polarity'] = sent_polarity

In [None]:
# Ground truth: star ratings mapped to sentiment labels by the project helper
# translate_labels, then encoded with encode_sent.
sentiment_labels = translate_labels(y)
y_true_sent = encode_sent(sentiment_labels)
# Predictions: the 'str_sent' column was added to review_data, not to the raw
# `df`, and review_data shares its row order and index with `y`.
y_pred_sent = encode_sent(review_data['str_sent'])

In [None]:
# Accuracy of the rule-based sentiment labels against the labels derived
# from the star ratings.
print(metrics.accuracy_score(y_true_sent,y_pred_sent))

#### Machine Learning Models

In [None]:
# Candidate classifiers, seeded with RANDOM_STATE where the estimator
# supports it.
rf_clf = RandomForestClassifier(n_estimators=100,random_state=RANDOM_STATE)
svm_clf = SVC(kernel='rbf',random_state=RANDOM_STATE)
# MultinomialNB is deterministic and has no random_state parameter; passing
# one raises TypeError at construction time.
mnnb_clf = MultinomialNB()
xgb_clf = XGBClassifier(random_state = RANDOM_STATE)

In [None]:
# Separate label arrays for XGBoost, produced by the project helper
# adjust_xgb_labels (presumably remapping the star ratings to the 0-based
# class ids XGBoost expects — TODO confirm).
y_train_xgb,y_test_xgb = adjust_xgb_labels(y_train,y_test)

In [None]:
# Fit every classifier on the resampled training split. XGBoost is trained
# on its own adjusted label array.
rf_clf.fit(x_train,y_train)
svm_clf.fit(x_train,y_train)
mnnb_clf.fit(x_train,y_train)
xgb_clf.fit(x_train,y_train_xgb)
# Predict on the held-out split with each fitted model.
y_pred_rf = rf_clf.predict(x_test)
y_pred_svm = svm_clf.predict(x_test)
y_pred_mnnb = mnnb_clf.predict(x_test)
# Previously missing: XGBoost predictions, expressed in the same adjusted
# label space as y_test_xgb.
y_pred_xgb = xgb_clf.predict(x_test)

In [None]:
# Report metrics for each model against its OWN predictions. The original
# cell passed the random-forest predictions to all four calls.
show_metrics(rf_clf,x_test,y_test,y_pred_rf,word2vec_model_sklearn_array,y)
show_metrics(svm_clf,x_test,y_test,y_pred_svm,word2vec_model_sklearn_array,y)
show_metrics(mnnb_clf,x_test,y_test,y_pred_mnnb,word2vec_model_sklearn_array,y)
# XGBoost predictions are computed inline so they are compared with
# y_test_xgb in the same adjusted label space.
# NOTE(review): the final `y` argument is still the unadjusted ratings;
# confirm what show_metrics does with it for the XGBoost model.
show_metrics(xgb_clf,x_test,y_test_xgb,xgb_clf.predict(x_test),word2vec_model_sklearn_array,y)