# Label Predicted Dialect for Tweet Dataset
#### Using model from the paper "Demographic Dialectal Variation in Social Media: A Case Study of African-American English" by Su Lin Blodgett, Lisa Green, and Brendan O'Connor, EMNLP 2016.

Note: Requires Python 2.7.

In [1]:
import pandas as pd
import csv

import twitteraae.code.predict as predict

In [2]:
# Directory used for both the input CSV and the output CSV.
DATA_FOLDER = "compiled_data"
# Input file: read with pd.read_csv; rows are accessed via their tweet_text and tweet_label columns.
LABELED_DATASET_NAME = "combined_labeled_data.csv"
# Output file: the same tweets and labels plus four predicted dialect proportion columns.
OUTPUT_DATASET_NAME = "dialect_metric_data.csv"

In [3]:
# Initialize the prediction model once, before any predict.predict() call.
# NOTE(review): presumably loads the twitteraae vocabulary/model files into
# module-level state -- confirm in twitteraae.code.predict.
predict.load_model()

In [4]:
# Load the labeled tweets; the loop below reads the tweet_text and tweet_label
# columns of every row.
dataset = pd.read_csv("{}/{}".format(DATA_FOLDER, LABELED_DATASET_NAME))

# The four proportion columns are listed in the same order as the values
# unpacked from predict.predict() below.
output_dataset_column_names = [
    "tweet_text",
    "tweet_label",
    "african_american_dialect_proportion",
    "hispanic_dialect_proportion",
    "asian_dialect_proportion",
    "white_dialect_proportion",
]
output_dataset_rows = []
for row in dataset.itertuples():
    tweet_text = row.tweet_text
    # Decode byte strings explicitly as UTF-8 (`bytes` is `str` on Python 2).
    # A bare .decode() falls back to the ASCII codec there and raises
    # UnicodeDecodeError on the first tweet containing an emoji or accented
    # character. Text that is already unicode is passed through untouched.
    # NOTE(review): assumes the CSV is UTF-8 encoded -- confirm.
    if isinstance(tweet_text, bytes):
        tweet_text = tweet_text.decode("utf-8")

    # The model takes a list of tokens; tokenize on whitespace.
    dialect_prediction = predict.predict(tweet_text.split())

    # If too few vocab words are recognized, predict may output None;
    # such tweets are left out of the output dataset.
    if dialect_prediction is None:
        continue

    aae, he, ae, we = dialect_prediction
    # Store the original (undecoded) text so the output column matches the input.
    output_dataset_rows.append(
        [row.tweet_text, row.tweet_label, aae, he, ae, we]
    )

output_dataset = pd.DataFrame(output_dataset_rows, columns=output_dataset_column_names)
output_dataset.to_csv("{}/{}".format(DATA_FOLDER, OUTPUT_DATASET_NAME), index=False)

In [5]:
# Row counts before and after labeling. The second number is smaller by the
# number of tweets skipped because predict.predict() returned None.
input_row_count = len(dataset)
labeled_row_count = len(output_dataset)
print(input_row_count)
print(labeled_row_count)

21062
21029
