In [1]:
import pandas as pd
from googletrans import Translator
from camel_tools.sentiment import SentimentAnalyzer
from transformers import pipeline

In [2]:
# Load block_3.csv in full; every later slice is taken from this frame.
# NOTE(review): the path is absolute and machine-specific, so the notebook only runs
# where this exact directory exists.
df = pd.read_csv('/Users/mindyshiben/codeup-data-science/arabic_media_nlp_project/block_3.csv')

In [3]:
# Keep the rows from position 60000 through the end of the frame.
subset_half = df.iloc[60000:]

In [4]:
# Summarise the slice: row count, index range, per-column non-null counts and dtypes.
subset_half.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 59255 entries, 60000 to 119254
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        59255 non-null  object
 1   url       59255 non-null  object
 2   headline  59254 non-null  object
 3   dateline  59255 non-null  object
 4   text      59255 non-null  object
 5   tags      59255 non-null  object
 6   source    59255 non-null  object
dtypes: object(7)
memory usage: 3.2+ MB


In [5]:
# Save the slice as subset_half.csv in the working directory; index=False leaves the
# row index out of the file.
subset_half.to_csv('subset_half.csv', index=False)

In [6]:
# Take the 5,000 rows at positions 10000-14999 as the working slice.
subset_3 = df.iloc[10000:15000]

In [7]:
# Save the slice as block_3_part_3.csv in the working directory, without the row index.
# NOTE(review): load_and_label_df(name) reads name + '.csv', so it only picks this file
# up when name == 'block_3_part_3'.
subset_3.to_csv('block_3_part_3.csv', index=False)

In [8]:
# Shared sentiment classifier: a transformers 'text-classification' pipeline built from
# the model named below. make_msa reads this module-level object.
msa = pipeline('text-classification', model="CAMeL-Lab/bert-base-arabic-camelbert-msa-sentiment")
# Tag for this run: create_labels_scores embeds it in the output filename, and
# load_and_label_df uses it as the stem of the CSV it reads.
name = 'subset_3'


def load_and_label_df(name):
    """Load ``name + '.csv'``, label and score it, and return the labeled frame.

    Progress is reported on stdout before and after the labeling step. The
    labeling itself (and the CSV it writes) is done by ``create_labels_scores``.
    """
    source_file = name + '.csv'
    frame = load_csv(source_file)
    print(f'loaded {name}.csv')

    print('labeling/scoring...')
    labeled = create_labels_scores(frame, name)
    print('done labeling/scoring!')

    return labeled

def load_csv(filename):
    """Read ``filename`` into a DataFrame and add four placeholder result columns.

    ``text_label``, ``text_score``, ``headline_label`` and ``headline_score``
    are each set to the string ``'invalid'`` on every row, in that order.
    """
    frame = pd.read_csv(filename)
    for column in ('text_label', 'text_score', 'headline_label', 'headline_score'):
        frame[column] = 'invalid'
    return frame

def make_msa(df_text):
    """Classify the sentiment of one text value with the module-level ``msa`` pipeline.

    The whole text is tried first. If that call fails (presumably because the
    text is too long for the model -- TODO confirm), the text is cut at its
    midpoint and each half is classified separately. The halves are merged only
    when they agree on the label, in which case their scores are averaged.

    Parameters
    ----------
    df_text : str
        Value of a single cell. ``None`` and float values (how pandas marks a
        missing cell in an object column) are reported as unlabeled without
        calling the model.

    Returns
    -------
    list of dict
        Whatever ``msa`` returned for the full text when that call succeeds.
        For merged halves, ``[{'label': <label>, 'score': <mean score>}]``.
        ``[{'label': 'unlabeled', 'score': 'unscored'}]`` when the value is
        missing, the halves disagree, or the model also fails on a half.
    """
    unlabeled = [{'label': 'unlabeled', 'score': 'unscored'}]

    # Missing cells arrive as None/NaN; there is nothing for the model to score.
    if df_text is None or isinstance(df_text, float):
        return unlabeled

    try:
        return msa(df_text)
    except Exception:
        # Deliberately ``Exception`` rather than a bare ``except`` so that
        # Ctrl-C (KeyboardInterrupt) still stops a long labeling run.
        pass

    try:
        midpoint = round(len(df_text) / 2)
        first = msa(df_text[:midpoint])[0]
        second = msa(df_text[midpoint:])[0]
    except Exception:
        return unlabeled

    # Halves that disagree cannot be merged into a single label.
    if first['label'] != second['label']:
        return unlabeled

    return [{'label': first['label'], 'score': (first['score'] + second['score']) / 2}]
        
def analyze_text(df):
    """Apply ``make_msa`` to every value of the frame's ``text`` column.

    Returns the resulting Series, aligned with ``df``'s index.
    """
    return df.text.apply(make_msa)

def analyze_headline(df):
    """Apply ``make_msa`` to every value of the frame's ``headline`` column.

    Returns the resulting Series, aligned with ``df``'s index.
    """
    return df.headline.apply(make_msa)

def label_and_scores(msa_scores):
    """Split classifier results into parallel lists of labels and scores.

    Parameters
    ----------
    msa_scores : iterable
        One entry per row; each entry is expected to be indexable so that
        ``entry[0]['label']`` and ``entry[0]['score']`` exist.

    Returns
    -------
    tuple of (list, list)
        ``(labels, scores)``, each with exactly one item per input entry. An
        entry that cannot be unpacked contributes ``False`` to both lists.
    """
    labels = []
    scores = []
    for result in msa_scores:
        try:
            top = result[0]
            # Read both fields before appending either one, so a result that has
            # a label but no score cannot leave the two lists with different lengths.
            label, score = top['label'], top['score']
        except (LookupError, TypeError):
            label, score = False, False
        labels.append(label)
        scores.append(score)

    return labels, scores

def create_labels_scores(df, name):
    """Add sentiment label/score columns for text and headline, then save the frame.

    ``df`` is modified in place: ``text_label``/``text_score`` come from
    ``analyze_text`` and ``headline_label``/``headline_score`` from
    ``analyze_headline``. The frame is then written, without its index, to
    ``'labeled_part_3_' + name + '.csv'`` in the working directory, and the
    same frame object is returned.
    """
    column_plan = (
        (analyze_text, 'text_label', 'text_score'),
        (analyze_headline, 'headline_label', 'headline_score'),
    )
    for analyze, label_column, score_column in column_plan:
        found_labels, found_scores = label_and_scores(analyze(df))
        df[label_column] = found_labels
        df[score_column] = found_scores

    # ``name`` is the only variable part of the output filename.
    output_file = 'labeled_part_3_' + name + '.csv'
    df.to_csv(output_file, index=False)

    return df

In [None]:
# Label only the 5,000-row slice that `name` ('subset_3') refers to; passing the full
# `df` here would run the model over every row of block_3.csv. The copy keeps the column
# assignments made inside create_labels_scores from writing into a slice of `df`.
sub_3 = create_labels_scores(subset_3.copy(), name)