In [None]:
import numpy as np
import pandas as pd

In [None]:
# Extract the archive with the standard library rather than the `unzip` shell
# command: this works on any platform and silently overwrites files left by a
# previous run instead of waiting on an interactive "replace?" prompt.
import zipfile

with zipfile.ZipFile("dataset.csv.zip") as archive:
    archive.extractall()

# Fetch

In [None]:
# Load the extracted CSV into a DataFrame and preview its first rows.
dataset = pd.read_csv("dataset.csv")
dataset.head()

In [None]:
# Summary statistics for the loaded frame.
dataset.describe()

In [None]:
# Count the missing values in each column.
print(dataset.isnull().sum())

In [None]:
# List the distinct values found in the Language column.
dataset['Language'].unique()

In [None]:
# Replace missing Language values with empty strings. The result is assigned
# back to the column instead of calling fillna(inplace=True) on the selected
# Series: the chained in-place form is deprecated and leaves the DataFrame
# unchanged when pandas Copy-on-Write is active.
dataset['Language'] = dataset['Language'].fillna('')

# Flag the rows whose Language field starts with a "[Photo" placeholder
mask_photo = dataset['Language'].astype(str).str.startswith("[Photo")

# Keep only the unflagged rows. The explicit copy makes the result an
# independent frame, so later column assignments do not raise
# SettingWithCopyWarning.
dataset = dataset[~mask_photo].copy()

# Defensive: drop any row whose Language is still missing
dataset = dataset.dropna(subset=['Language'])

# Check that the flagged rows were removed
print(dataset['Language'].unique())


In [None]:
# Flag the rows whose Language field starts with a "[Video" placeholder
mask_video = dataset['Language'].astype(str).str.startswith("[Video")

# Keep only the unflagged rows. The explicit copy makes the result an
# independent frame, so later column assignments do not raise
# SettingWithCopyWarning.
dataset = dataset[~mask_video].copy()

# Defensive: drop any row whose Language is missing
dataset = dataset.dropna(subset=['Language'])

# Check that the flagged rows were removed
print(dataset['Language'].unique())

In [None]:
import re
import string
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

In [None]:
# Download only the NLTK resources this notebook uses, instead of the whole
# 'all' collection (which is very large and already contains the others).
#   stopwords          -> stopwords.words('english')
#   wordnet, omw-1.4   -> WordNetLemmatizer (some NLTK releases need omw-1.4)
#   punkt, punkt_tab   -> word_tokenize (newer NLTK releases load punkt_tab)
for resource in ('stopwords', 'wordnet', 'omw-1.4', 'punkt', 'punkt_tab'):
    nltk.download(resource)

In [None]:
# Patterns are compiled once here instead of on every call to clean_text.
_BRACKETED_RE = re.compile(r'\[.*?\]')
_URL_RE = re.compile(r'https?://\S+|www\.\S+')
_HTML_TAG_RE = re.compile(r'<.*?>+')
_PUNCTUATION_RE = re.compile(rf'[{re.escape(string.punctuation)}]')
_NEWLINE_RE = re.compile(r'\n')
_WORD_WITH_DIGIT_RE = re.compile(r'\w*\d\w*')


def clean_text(text):
    """Normalise a piece of raw text for the downstream tokenisation steps.

    The steps run in this order: lowercase, drop ``[...]`` spans, drop URLs,
    drop HTML tags, drop ASCII punctuation, turn newlines into spaces, drop
    every word that contains a digit.

    Args:
        text: Value to clean. Non-string values are converted with ``str``.
            ``None`` and float NaN are treated as missing text.

    Returns:
        The cleaned string; ``''`` for missing input.
    """
    # Missing values would otherwise become the literal tokens "none" / "nan".
    if text is None or (isinstance(text, float) and text != text):
        return ''

    # Convert to lowercase
    text = str(text).lower()

    # Remove square brackets and the contents inside them
    text = _BRACKETED_RE.sub('', text)

    # Remove URLs
    text = _URL_RE.sub('', text)

    # Remove HTML tags
    text = _HTML_TAG_RE.sub('', text)

    # Remove punctuation
    text = _PUNCTUATION_RE.sub('', text)

    # Replace newlines with a space so words on adjacent lines are not fused
    text = _NEWLINE_RE.sub(' ', text)

    # Remove words containing digits
    text = _WORD_WITH_DIGIT_RE.sub('', text)

    return text

In [None]:
# Run clean_text over every value of the Text column, then preview the frame.
dataset['Text'] = dataset['Text'].apply(clean_text)

dataset.head()

In [None]:
# Run clean_text over every value of the Language column, then preview the frame.
dataset['Language'] = dataset['Language'].apply(clean_text)

dataset.head()

In [None]:
# Distinct Language values after clean_text has been applied to the column.
dataset['Language'].unique()

In [None]:
_ENGLISH_STOP_WORDS = None  # filled on first use by _english_stop_words()


def _english_stop_words():
    """Return the English stop-word set, loading it from NLTK only once."""
    global _ENGLISH_STOP_WORDS
    if _ENGLISH_STOP_WORDS is None:
        _ENGLISH_STOP_WORDS = frozenset(stopwords.words('english'))
    return _ENGLISH_STOP_WORDS


def remove_stopwords(text):
    """Remove English stop words from ``text``.

    The text is tokenised with ``word_tokenize``; tokens found in NLTK's
    English stop-word list are dropped and the rest are joined with single
    spaces.

    Args:
        text: String to filter. Null values (per ``pd.isnull``) are returned
            unchanged.

    Returns:
        The filtered string, or the original value when it is null.
    """
    if pd.isnull(text):
        return text
    # The cached set avoids re-reading the stop-word corpus for every row.
    stop_words = _english_stop_words()
    return ' '.join(word for word in word_tokenize(text) if word not in stop_words)
def lemmatize_text(text):
    """Lemmatize every token of ``text`` and re-join the tokens with single spaces.

    Args:
        text: String to process. Null values (per ``pd.notnull``) are returned
            unchanged.

    Returns:
        The lemmatized string, or the original value when it is null.
    """
    if not pd.notnull(text):
        return text
    wordnet = WordNetLemmatizer()
    return ' '.join(map(wordnet.lemmatize, word_tokenize(text)))

In [None]:
# Strip English stop words from every value of the Text column, then preview the frame.
dataset['Text'] = dataset['Text'].apply(remove_stopwords)

dataset.head()

In [None]:
# Lemmatize every value of the Text column, then preview the frame.
dataset['Text'] = dataset['Text'].apply(lemmatize_text)

dataset.head()

In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report,accuracy_score
import mlflow
from sklearn.naive_bayes import MultinomialNB

In [None]:
# separar
tweets = dataset['Text']
language_labels = dataset['Language']
sentiment_labels = dataset['Label']

In [None]:
def data_train(mlflow_experiment_id):
    """Train and evaluate the language and sentiment classifiers in one MLflow run.

    Reads the notebook-level ``tweets``, ``language_labels`` and
    ``sentiment_labels`` series, holds out 20% of the rows as a test set
    (``random_state=42``), fits one TF-IDF pipeline per task and logs the
    evaluation results to MLflow.

    Args:
        mlflow_experiment_id: Experiment ID passed to ``mlflow.start_run``.

    Logged to MLflow:
        Metrics ``lang_accuracy`` and ``sent_accuracy``; text artifacts
        ``lang_report.txt`` and ``sent_report.txt`` holding the
        classification reports.
    """
    tweets_train, tweets_test, lang_labels_train, lang_labels_test, sent_labels_train, sent_labels_test = train_test_split(
        tweets, language_labels, sentiment_labels, test_size=0.2, random_state=42
    )

    with mlflow.start_run(experiment_id=mlflow_experiment_id):
        # Language classification: TF-IDF features fed to a linear SVM
        language_pipeline = Pipeline([
            ('tfidf', TfidfVectorizer()),
            ('clf', LinearSVC())
        ])
        # Train the language classification model
        language_pipeline.fit(tweets_train, lang_labels_train)

        # Evaluate the language model on the held-out tweets
        lang_predictions = language_pipeline.predict(tweets_test)
        lang_accuracy = accuracy_score(lang_labels_test, lang_predictions)
        lang_report = classification_report(lang_labels_test, lang_predictions)

        mlflow.log_metric("lang_accuracy", lang_accuracy)
        # classification_report returns a string, which log_metric cannot
        # store (metrics must be numeric); keep it as a text artifact instead.
        mlflow.log_text(lang_report, "lang_report.txt")

        # Sentiment classification: TF-IDF features fed to multinomial naive Bayes
        sentiment_pipeline = Pipeline([
            ('tfidf', TfidfVectorizer()),
            ('clf', MultinomialNB())
        ])
        # Train the sentiment classification model
        sentiment_pipeline.fit(tweets_train, sent_labels_train)

        # Evaluate the sentiment model on the held-out tweets
        sent_predictions = sentiment_pipeline.predict(tweets_test)

        # Score against the sentiment labels (not the language labels)
        sent_accuracy = accuracy_score(sent_labels_test, sent_predictions)
        sent_report = classification_report(sent_labels_test, sent_predictions)

        mlflow.log_metric("sent_accuracy", sent_accuracy)
        mlflow.log_text(sent_report, "sent_report.txt")

        


In [None]:
# Run the training routine, passing "0" as the MLflow experiment ID.
data_train("0")

In [None]:
# Build the 80/20 train/test split at notebook level (same test_size and
# random_state as inside data_train) so the cells below can use the pieces.
tweets_train, tweets_test, lang_labels_train, lang_labels_test, sent_labels_train, sent_labels_test = train_test_split(
            tweets, language_labels, sentiment_labels, test_size=0.2, random_state=42
        )

In [None]:
# Preview the first training tweets.
tweets_train.head()

In [None]:
# Preview the first training language labels.
lang_labels_train.head()

In [None]:
# Preview the first training sentiment labels.
sent_labels_train.head()

In [None]:
# Distinct sentiment labels present in the training split.
sent_labels_train.unique()

In [None]:
# classificacao de language
# Language classification: TF-IDF features fed to a linear SVM
language_pipeline = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('clf', LinearSVC())
])
# Train the language classification model
language_pipeline.fit(tweets_train, lang_labels_train)

In [None]:
from sklearn.naive_bayes import MultinomialNB

In [None]:
# Define the pipeline for sentiment classification
# Sentiment classification: TF-IDF features fed to multinomial naive Bayes
sentiment_pipeline = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('clf', MultinomialNB())
])
# Train the sentiment classification model; without this fit every later
# sentiment_pipeline.predict call fails because the pipeline is unfitted.
sentiment_pipeline.fit(tweets_train, sent_labels_train)

In [None]:
# Predict the language of a tweet
# Predict the language of a tweet
tweet = "Predict the language of a tweet"
# Apply the same preprocessing the training text went through, so the model
# sees input in the form it was fitted on.
tweet_prepared = lemmatize_text(remove_stopwords(clean_text(tweet)))
predicted_language = language_pipeline.predict([tweet_prepared])[0]
print("Predicted language:", predicted_language)

In [None]:
# Predict the sentiment of a tweet
# Predict the sentiment of a tweet
tweet = "reasonable material"
# Apply the same preprocessing the training text went through, so the model
# sees input in the form it was fitted on.
tweet_prepared = lemmatize_text(remove_stopwords(clean_text(tweet)))
predicted_sentiment = sentiment_pipeline.predict([tweet_prepared])[0]
print("Predicted sentiment:", predicted_sentiment)

In [None]:
# Avaliar o modelo de identificação de idioma
# Evaluate the language identification model on the held-out tweets
lang_predictions = language_pipeline.predict(tweets_test)
lang_report = classification_report(lang_labels_test, lang_predictions)
print("Language identification report:\n", lang_report)

In [None]:
# Avaliar o modelo de identificação de sentimento
# Evaluate the sentiment classification model on the held-out tweets
sent_predictions = sentiment_pipeline.predict(tweets_test)
sent_report = classification_report(sent_labels_test, sent_predictions)
print("Sentiment classification report:\n", sent_report)