
### Plot confusion matrix

In [None]:
import seaborn as sns 
import matplotlib.pyplot as plt

labels = ["positive","negative","normal"]
# Hard-coded confusion-matrix counts; rows and columns are both ticked with `labels`.
cm = [[1365024.,39645.,24020.],[54710.,420324.,21444.],[ 119578.,62023.,44302.]]
# Explicit figure/axes pair instead of the implicit pyplot "current figure".
fig, ax = plt.subplots(figsize=(7, 7))
# plot confusion matrix
sns.heatmap(cm,
            cmap='viridis',
            annot=True, fmt='.0f',  # whole-number counts; fmt='0' printed them as "1365024.0"
            cbar=False,
            xticklabels=labels,
            yticklabels=labels,
            ax=ax)
ax.set_title("Confusion matrix")
# NOTE(review): axis labels (actual vs. predicted) are not set because the
# orientation of `cm` is not stated in this notebook - confirm and add them.
plt.show()

### check word in summary

In [None]:
import pandas as pd
import re 
import spacy  # For preprocessing
# Load the result CSV and print its column/dtype overview.
df = pd.read_csv("data_set_tiktok_update_2_result.csv",low_memory=False)
df.info()
# Take an explicit copy: columns are assigned on df2 further down, and doing
# that on a bare column selection triggers pandas' SettingWithCopyWarning.
df2 = df[['playCount','commentCount','diggCount','shareCount','summary','object']].copy()
def clean_1(text):
    """Remove every occurrence of "In the video," from ``text``.

    Non-string values (for example the NaN pandas uses for missing cells)
    are returned unchanged.
    """
    # Explicit type check instead of a bare ``except:``, which would also
    # swallow KeyboardInterrupt and hide unrelated errors.
    if not isinstance(text, str):
        return text
    return text.replace("In the video,", "")
def clean_2(text):
    """Remove every occurrence of "The video" from ``text``.

    Non-string values (for example the NaN pandas uses for missing cells)
    are returned unchanged.
    """
    # Explicit type check instead of a bare ``except:``, which would also
    # swallow KeyboardInterrupt and hide unrelated errors.
    if not isinstance(text, str):
        return text
    return text.replace("The video", "")
# Strip the boilerplate lead-in phrases from both free-text columns.
df2['summary'] = df2['summary'].map(clean_1)
df2['object'] = df2['object'].map(clean_2)

# Prefer a GPU for spaCy, then load the large English pipeline with the
# NER and parser components switched off.
spacy.prefer_gpu()
nlp = spacy.load(
    "en_core_web_lg",
    disable=['ner', 'parser'],
)
def cleaning(doc):
    """Lemmatise ``doc`` and drop stop words.

    ``doc`` is iterated token by token (a spaCy Doc is expected); each
    token must expose ``lemma_`` and ``is_stop``. Lemmas equal to 'cq',
    'rg' or 'bb' are discarded as well.

    Returns the remaining lemmas joined by single spaces, or None when two
    or fewer lemmas are left.
    """
    unwanted = ('cq', 'rg', 'bb')
    kept = []
    for token in doc:
        if token.is_stop:
            continue
        lemma = token.lemma_
        if lemma not in unwanted:
            kept.append(lemma)
    if len(kept) <= 2:
        return None
    return ' '.join(kept)
# Replace every run of characters other than letters/apostrophes with a
# space, lower-case the text, then lemmatise it through the spaCy pipeline.
brief_cleaning = (
    re.sub("[^A-Za-z']+", ' ', str(text)).lower()
    for text in df2['summary']
)
df2['clean'] = list(map(cleaning, nlp.pipe(brief_cleaning, n_process=-1, batch_size=5000)))

object_cleaning = (
    re.sub("[^A-Za-z']+", ' ', str(text)).lower()
    for text in df2['object']
)
df2['object_clen'] = list(map(cleaning, nlp.pipe(object_cleaning, n_process=-1, batch_size=5000)))

# The pipeline is not needed past this point.
del nlp
# cleaning() returns None for very short texts; dropna() removes those rows
# along with any other row that has a missing value.
df2 = df2.dropna().reset_index(drop=True)
# df2['token'] = df2['clean'].apply(lambda x: x.split())
df2.head()

In [None]:
# Turn each cleaned object description into a list of whitespace-separated words.
df2['object_clen_split'] = df2['object_clen'].map(str.split)
# Words that are excluded from the per-row word lists (duplicate 'engage' removed).
del_lst = ['video','appear','show','hold','possibly','engage','wear','scene','black','suggest','likely','casual','content','close','talk','setting','provide','person']
def del_word(x):
    """Return the distinct words of ``x`` that are not in ``del_lst``.

    Duplicates are dropped while keeping first-occurrence order, so the
    result is deterministic; a ``set`` round-trip orders strings differently
    from one interpreter run to the next because of hash randomisation.
    """
    return list(dict.fromkeys(text for text in x if text not in del_lst))
df2['object_clen_split'] = df2['object_clen_split'].apply(del_word)
# Series.explode() has no column parameter: the column name passed here
# before was taken as a truthy ``ignore_index``. State that explicitly.
df_exploded = df2['object_clen_split'].explode(ignore_index=True)
# Frequency of every remaining word across all rows.
x = df_exploded.value_counts()
print(x)

In [None]:
# Write the cleaned frame to disk without the index column.
df2.to_csv(path_or_buf="video_clean.csv", index=False)
# spark_data = spark.createDataFrame(df2)

### summary tiktok video

In [None]:
import pandas as pd
# Read data_video_predict.csv and print its column/dtype overview.
dfx = pd.read_csv(
    "data_video_predict.csv",
    low_memory=False,
)
dfx.info()

Mapping of `prediction` values (left-hand side) to sentiment labels:

0.0 -> 1.0 positive \
1.0 -> -1.0 negative \
2.0 -> 0.0 normal 

In [None]:
# Explicit copy: a new column is assigned on dfx2 below, which would raise
# SettingWithCopyWarning on a bare column selection.
dfx2 = dfx[['playCount','commentCount','diggCount','shareCount','summary','prediction']].copy()
def return_val(x):
    """Translate a numeric prediction into its sentiment label.

    0.0 -> "positive", 1.0 -> "negative", 2.0 -> "normal"; any other value
    yields None.
    """
    for code, label in ((0.0, "positive"), (1.0, "negative"), (2.0, "normal")):
        if x == code:
            return label
    return None
# Attach the textual label for every prediction, then show how many rows
# fall under each label.
dfx2['prediction_val'] = dfx2['prediction'].map(return_val)
dfx2['prediction_val'].value_counts()

In [None]:
# Split the rows by predicted label and print summary statistics
# (rounded to two decimals) for the positive and negative groups.
df_pos = dfx2.loc[dfx2['prediction_val'].eq("positive")]
df_neg = dfx2.loc[dfx2['prediction_val'].eq("negative")]
print(df_pos.describe().round(2))
print(df_neg.describe().round(2))

### check similarity 

In [None]:
import pandas as pd
# Result file, reduced to the join key and the two text columns.
dfx = pd.read_csv(
    "data_set_tiktok_update_2_result.csv", low_memory=False
)[['webVideoUrl','summary','object']]
dfx.info()

# Validation file that is joined against the result file.
dfxx = pd.read_csv("validate-videosummary-f15.csv", low_memory=False)
dfxx.info()

# Left join on the video URL: every validation row is kept.
df_merge = dfxx.merge(dfx, on="webVideoUrl", how="left")
df_merge.info()

In [None]:
import spacy 
def clean_1(text):
    """Remove every occurrence of "In the video," from ``text``.

    Non-string values (for example the NaN pandas uses for missing cells)
    are returned unchanged.
    """
    # Explicit type check instead of a bare ``except:``, which would also
    # swallow KeyboardInterrupt and hide unrelated errors.
    if not isinstance(text, str):
        return text
    return text.replace("In the video,", "")
def clean_2(text):
    """Remove every occurrence of "The video shows" from ``text``.

    Non-string values (for example the NaN pandas uses for missing cells)
    are returned unchanged.
    """
    # Explicit type check instead of a bare ``except:``, which would also
    # swallow KeyboardInterrupt and hide unrelated errors.
    if not isinstance(text, str):
        return text
    return text.replace("The video shows", "")
# Strip the boilerplate lead-in phrases from both summary columns.
df_merge['summary'] = df_merge['summary'].map(clean_1)
df_merge['summary_from_human'] = df_merge['summary_from_human'].map(clean_2)

# Prefer a GPU for spaCy, then load the large English pipeline with the
# NER and parser components switched off.
spacy.prefer_gpu()
nlp = spacy.load(
    "en_core_web_lg",
    disable=['ner', 'parser'],
)
def cleaning(doc):
    """Lemmatise ``doc`` and drop stop words.

    ``doc`` is iterated token by token (a spaCy Doc is expected); each
    token must expose ``lemma_`` and ``is_stop``. Lemmas equal to 'cq',
    'rg' or 'bb' are discarded as well.

    Returns the remaining lemmas joined by single spaces, or None when two
    or fewer lemmas are left.
    """
    skip = ('cq', 'rg', 'bb')
    lemmas = (token.lemma_ for token in doc if not token.is_stop)
    words = [lemma for lemma in lemmas if lemma not in skip]
    return ' '.join(words) if len(words) > 2 else None
# `re` is used below but was only imported in an earlier section; import it
# here too, as this section already does for its other dependencies.
import re

# Replace every run of characters other than letters/apostrophes with a
# space, lower-case the text, then lemmatise it through the spaCy pipeline.
brief_cleaning = (re.sub("[^A-Za-z']+", ' ', str(row)).lower() for row in df_merge['summary'])
df_merge['summary_clean'] = [cleaning(doc) for doc in nlp.pipe(brief_cleaning, n_process=-1, batch_size=5000)]
summary_from_human_cleaning = (re.sub("[^A-Za-z']+", ' ', str(row)).lower() for row in df_merge['summary_from_human'])
df_merge['summary_human_clen'] = [cleaning(doc) for doc in nlp.pipe(summary_from_human_cleaning, n_process=-1, batch_size=5000)]

def similarity_check(row):
    """Return the spaCy similarity between the two cleaned summaries of ``row``.

    ``row`` must provide 'summary_human_clen' and 'summary_clean'. When
    either value is not a string, NaN is returned instead of handing the
    value to the pipeline: cleaning() yields None for very short texts and
    those rows are not dropped before this function is applied.
    """
    human_text = row['summary_human_clen']
    model_text = row['summary_clean']
    if not isinstance(human_text, str) or not isinstance(model_text, str):
        return float("nan")
    text1 = nlp(human_text)
    text2 = nlp(model_text)
    return text2.similarity(text1)
# Score every row; the function receives one row at a time.
df_merge['similarity'] = df_merge.apply(similarity_check, axis="columns")

In [None]:
# Summary statistics of the similarity scores.
df_merge.loc[:, 'similarity'].describe()