In [4]:
import requests
import zipfile
import pandas as pd
from io import BytesIO

# Define the URL of the zip file
zip_file_url = 'https://github.com/skoltech-nlp/detox/releases/download/emnlp2021/filtered_paranmt.zip'

# Send an HTTP GET request to download the zip file
response = requests.get(zip_file_url)

# Check if the request was successful
if response.status_code == 200:
    # Read the zip file
    with zipfile.ZipFile(BytesIO(response.content)) as zip_file:
        # Assuming there's only one .tsv file in the zip, you can read it like this
        tsv_file = zip_file.namelist()[0]
        with zip_file.open(tsv_file) as tsv:
            # Read the .tsv file into a DataFrame
            train_text = pd.read_csv(tsv, sep='\t')

            # Display the head of the DataFrame
            head = train_text.head()
            print(head)
else:
    print("Failed to download the zip file.")

   Unnamed: 0                                          reference   
0           0  If Alkar is flooding her with psychic waste, t...  \
1           1                          Now you're getting nasty.   
2           2           Well, we could spare your life, for one.   
3           3          Ah! Monkey, you've got to snap out of it.   
4           4                   I've got orders to put her down.   

                                         translation  similarity  lenght_diff   
0  if Alkar floods her with her mental waste, it ...    0.785171     0.010309  \
1                        you're becoming disgusting.    0.749687     0.071429   
2                      well, we can spare your life.    0.919051     0.268293   
3                       monkey, you have to wake up.    0.664333     0.309524   
4                         I have orders to kill her.    0.726639     0.181818   

    ref_tox   trn_tox  
0  0.014195  0.981983  
1  0.065473  0.999039  
2  0.213313  0.985068  
3  0.053

In [5]:
head

Unnamed: 0.1,Unnamed: 0,reference,translation,similarity,lenght_diff,ref_tox,trn_tox
0,0,"If Alkar is flooding her with psychic waste, t...","if Alkar floods her with her mental waste, it ...",0.785171,0.010309,0.014195,0.981983
1,1,Now you're getting nasty.,you're becoming disgusting.,0.749687,0.071429,0.065473,0.999039
2,2,"Well, we could spare your life, for one.","well, we can spare your life.",0.919051,0.268293,0.213313,0.985068
3,3,"Ah! Monkey, you've got to snap out of it.","monkey, you have to wake up.",0.664333,0.309524,0.053362,0.994215
4,4,I've got orders to put her down.,I have orders to kill her.,0.726639,0.181818,0.009402,0.999348


In [6]:
from sklearn.model_selection import train_test_split

X = train_text.reference

y = train_text['ref_tox']

X_train, X_val, y_train, y_val = train_test_split(X, y, shuffle=True, random_state=123)

In [7]:
from sklearn.feature_extraction import _stop_words
import string

stop_words = _stop_words.ENGLISH_STOP_WORDS

def remove_punctuation(text):
    translator = str.maketrans('', '', string.punctuation)
    text_without_punct = text.translate(translator)
    return text_without_punct

def cleaning(doc):
    # Remove punctuation
    doc = remove_punctuation(doc)

    doc = ''.join([word for word in doc if not word.isdigit()])

    # Remove stop_words
    tokens = doc.split()
    cleaned_tokens = [token for token in tokens if token not in stop_words]
    cleaned_doc = ' '.join(cleaned_tokens)

    return cleaned_doc

In [8]:
from sklearn.feature_extraction.text import CountVectorizer

vect = CountVectorizer(max_features=5000, preprocessor=cleaning)
X_train_bow = vect.fit_transform(X_train)
X_val_bow = vect.fit_transform(X_val)

In [9]:
pd.DataFrame(X_train_bow.A[:5], columns=vect.get_feature_names_out())

Unnamed: 0,AIDS,AND,Aah,About,According,Actually,Adam,Admiral,Africa,African,...,young,younger,youre,youth,youve,zero,zombie,zombies,zone,zoo
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.multioutput import MultiOutputClassifier
from sklearn import preprocessing
lab = preprocessing.LabelEncoder()
y_transformed = lab.fit_transform(y_train)

classifier = LogisticRegression()
classifier.fit(X_train_bow, y_transformed)

In [None]:
from sklearn.metrics import accuracy_score, mean_squared_error
y_val_transformed = lab.fit_transform(y_val)
y_pred = classifier.predict(X_val)
accuracy = accuracy_score(y_val_transformed, y_pred)
mse = mean_squared_error(y_val_transformed, y_pred)

In [57]:
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, mean_squared_error

# Example data (replace with your data)
X_train = ["This is a toxic sentence.", "This is a non-toxic sentence.", "Another toxic example."]
y_train = np.array([0.8, 0.1, 0.9])  # Toxicity labels between 0 and 1

# Create a CountVectorizer to convert words to vectors
vectorizer = CountVectorizer()
X_train_counts = vectorizer.fit_transform(X_train)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_train_counts, y_train, test_size=0.2, random_state=42)

# Create and train a Logistic Regression classifier
classifier = LogisticRegression()
classifier.fit(X_train, y_train)

# Make predictions on the test data
y_pred = classifier.predict(X_test)

# Evaluate the classifier (you can choose a different metric)
accuracy = accuracy_score(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)

print("Accuracy:", accuracy)
print("Mean Squared Error:", mse)

# Example: Predict toxicity of a new sentence
new_sentence = ["This is a neutral sentence."]
new_sentence_counts = vectorizer.transform(new_sentence)
toxicity_score = classifier.predict(new_sentence_counts)
print("Toxicity Score:", toxicity_score[0])

ValueError: Unknown label type: 'continuous'