Data description at the [codalab 26655 competition](https://competitions.codalab.org/competitions/26655#learn_the_details).


# Load the data


In [None]:
import pandas as pd


# Read the three competition splits from their Excel workbooks, in the order
# train -> validation -> test, and bind each resulting frame to its own name.
train, validation, test = map(
    pd.read_excel,
    (
        "data/Constraint_English_Train.xlsx",
        "data/Constraint_English_Val.xlsx",
        "data/Constraint_English_Test.xlsx",
    ),
)


Since this is an unsupervised learning problem, we'll join all the data into a single dataframe.
We'll also drop the `id` column.


In [None]:
# Remove the "id" column from every split. `drop` returns a new frame, so each
# name is rebound to its id-free copy.
train, validation, test = (
    split.drop(columns=["id"]) for split in (train, validation, test)
)


In [None]:
# Stack the three splits into one frame. `ignore_index=True` gives the result a
# fresh 0..n-1 index; without it each split keeps its own row labels, so the
# same label can occur once per split and label-based lookups or alignment on
# `dataframe` become ambiguous.
dataframe = pd.concat([train, validation, test], ignore_index=True)


Show the most common words in the dataset.
We'll also show them separately for tweets labelled real and tweets labelled fake.


In [None]:
from wordcloud import WordCloud
import matplotlib.pyplot as plt


In [None]:
# Word cloud built from every tweet in the combined dataset.
all_tweets_text = " ".join(dataframe["tweet"])
all_tweets_cloud = WordCloud().generate(all_tweets_text)

plt.imshow(all_tweets_cloud, interpolation="bilinear")
plt.axis("off")  # the image has no meaningful axes
plt.show()


In [None]:
# Word cloud restricted to the tweets whose label is "fake".
is_fake = dataframe["label"] == "fake"
fake_cloud = WordCloud().generate(" ".join(dataframe.loc[is_fake, "tweet"]))

plt.imshow(fake_cloud, interpolation="bilinear")
plt.axis("off")  # the image has no meaningful axes
plt.show()


In [None]:
# Word cloud restricted to the tweets whose label is "real".
real_tweets = dataframe.loc[dataframe["label"] == "real", "tweet"]
real_cloud = WordCloud().generate(" ".join(real_tweets))

plt.imshow(real_cloud, interpolation="bilinear")
plt.axis("off")  # the image has no meaningful axes
plt.show()


In [None]:
from nltk.tokenize import word_tokenize


In [None]:
# Tokenise every tweet with NLTK's `word_tokenize` and store the resulting
# list of tokens per row in a new "tokens" column.
dataframe["tokens"] = dataframe["tweet"].map(word_tokenize)


In [None]:
import re


Remove URLs.

In [None]:
# Matches a whole URL inside raw tweet text: anything that starts with
# http://, https:// or www. and runs up to the next whitespace.
url_rx = re.compile(r'https?://\S+|www\.\S+', flags=re.IGNORECASE)

# URLs have to be stripped from the raw text *before* tokenising.
# `word_tokenize` breaks "https://t.co/abc" into "https", ":" and "//t.co/abc",
# so filtering already-built tokens only drops the "https" piece and leaves the
# rest of the link behind (and would also drop ordinary words that merely start
# with "http"). Rebuild the tokens from the URL-free text instead.
dataframe['tokens'] = (
    dataframe['tweet']
    .str.replace(url_rx, ' ', regex=True)
    .apply(word_tokenize)
)


In [None]:
from nltk.stem import WordNetLemmatizer


In [None]:
from sentence_transformers import SentenceTransformer


In [None]:
from transformers import AutoTokenizer, AutoModel


In [None]:
# Sentence-embedding model used to turn each tweet into a vector.
embedding_model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

# Encode the raw tweet text. The frame has no "sentence" column at this point
# (only "tweet", "label" and "tokens" are created above), so indexing it raised
# a KeyError. `.tolist()` hands `encode` a plain list of strings, which keeps
# its internal positional indexing independent of the frame's index labels.
# NOTE(review): assumes the raw tweets are the intended input — confirm.
embeddings = embedding_model.encode(
    dataframe['tweet'].tolist(),
    show_progress_bar=True,
)


In [None]:
# Load the tokenizer and the model from the same pretrained CLIP checkpoint.
CLIP_CHECKPOINT = "openai/clip-vit-base-patch32"

tokenizer = AutoTokenizer.from_pretrained(CLIP_CHECKPOINT)
model = AutoModel.from_pretrained(CLIP_CHECKPOINT)


In [None]:
# Replace every token by its WordNet lemma (the lemmatizer's default settings
# are used), overwriting the "tokens" column with the lemmatised lists.
lemmatizer = WordNetLemmatizer()
dataframe['tokens'] = dataframe['tokens'].map(
    lambda token_list: list(map(lemmatizer.lemmatize, token_list))
)
