In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report

In [4]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [5]:
trainData = pd.read_csv('/content/drive/MyDrive/finalDataset.csv')
trainData = trainData.dropna()
print(trainData.head())

       textID                                               text  \
0  cb774db0d1                I`d have responded, if I were going   
1  549e992a42      Sooo SAD I will miss you here in San Diego!!!   
2  088c60f138                          my boss is bullying me...   
3  9642c003ef                     what interview! leave me alone   
4  358bd9e861   Sons of ****, why couldn`t they put them on t...   

                         selected_text sentiment Time of Tweet Age of User  \
0  I`d have responded, if I were going   neutral       morning        0-20   
1                             Sooo SAD  negative          noon       21-30   
2                          bullying me  negative         night       31-45   
3                       leave me alone  negative       morning       46-60   
4                        Sons of ****,  negative          noon       60-70   

       Country  Population -2020  Land Area (Km²)  Density (P/Km²)  \
0  Afghanistan          38928346         652860.0   

In [6]:
clean_tweets = trainData['cleaned_text']
print(clean_tweets)

0                                          responded going
1                                  soon sad miss san diego
2                                             boy building
3                                    interview leave alone
4                     son could put release already bought
                               ...                        
27475    wish could come see denver husband lost job af...
27476    wondered rake client made clear net force des ...
27477    may good enjoy break probably need hectic week...
27478                                                worth
27479                      flirting going at smile may hug
Name: cleaned_text, Length: 27368, dtype: object


ngram_range = (1,2) --> since it considers unigrams (aka individual words) and two-word combinations (bigrams) too so it takes better context such as 'machine learning' or 'data science'.

In [8]:
vectorizer = TfidfVectorizer(max_features=None, ngram_range = (1,2))
train_data_features = vectorizer.fit_transform(clean_tweets)
print(train_data_features.shape)

(27368, 132139)


In [9]:
y_train = trainData['sentiment']

In [10]:
model = LogisticRegression(random_state=0, solver='lbfgs', multi_class='multinomial')
model.fit(train_data_features, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [7]:
testData = pd.read_csv('/content/drive/MyDrive/finalDatasetTest.csv')
testData = testData.dropna()
print(testData.head())

       textID                                               text sentiment  \
0  f87dea47db  Last session of the day  http://twitpic.com/67ezh   neutral   
1  96d74cb729   Shanghai is also really exciting (precisely -...  positive   
2  eee518ae67  Recession hit Veronique Branquinho, she has to...  negative   
3  01082688c6                                        happy bday!  positive   
4  33987a8ee5             http://twitpic.com/4w75p - I like it!!  positive   

  Time of Tweet Age of User      Country  Population -2020  Land Area (Km²)  \
0       morning        0-20  Afghanistan        38928346.0         652860.0   
1          noon       21-30      Albania         2877797.0          27400.0   
2         night       31-45      Algeria        43851044.0        2381740.0   
3       morning       46-60      Andorra           77265.0            470.0   
4          noon       60-70       Angola        32866272.0        1246700.0   

   Density (P/Km²)                                      

In [11]:
clean_tweets_test = testData['cleaned_text']
print(clean_tweets_test)

0                                        last session day
1       shanghai also really exciting precisely skyscr...
2       recession hit veronique branquinho quit compan...
3                                          happy birthday
4                                                    like
                              ...                        
3529                                      tired sleep try
3530    alone old house thanks net keep alive kicking ...
3531    know mean little dog sinking depression want m...
3532            supra next couture video going love video
3533                                    omgssh and cut by
Name: cleaned_text, Length: 3517, dtype: object


In [12]:
y_test = testData['sentiment']

In [13]:
test_data_features = vectorizer.transform(clean_tweets_test)
result = model.predict(test_data_features)
output = pd.DataFrame( data={"id":testData["textID"], "sentiment":result} )

In [None]:
output.to_csv("resultPredictionsOnTest.csv", index=False, quoting=3 )

In [None]:
accuracy = accuracy_score(y_test, result)
print(f'Accuracy: {accuracy:.4f}')

precision = precision_score(y_test, result, average='weighted', zero_division=0) # weighted since is multiclass classification
recall = recall_score(y_test, result, average='weighted', zero_division=0)
f1 = f1_score(y_test, result, average='weighted', zero_division=0)

print(f'Precision: {precision:.4f}')
print(f'Recall: {recall:.4f}')
print(f'F1 Score: {f1:.4f}')

# Print a classification report
print("\nClassification Report:")
print(classification_report(y_test, result))

Accuracy: 0.6960
Precision: 0.7006
Recall: 0.6960
F1 Score: 0.6962

Classification Report:
              precision    recall  f1-score   support

    negative       0.72      0.62      0.66       999
     neutral       0.64      0.72      0.68      1415
    positive       0.76      0.74      0.75      1103

    accuracy                           0.70      3517
   macro avg       0.71      0.69      0.70      3517
weighted avg       0.70      0.70      0.70      3517



NEG SPACY

In [14]:
pip install negspacy

Collecting negspacy
  Downloading negspacy-1.0.4.tar.gz (13 kB)
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Building wheels for collected packages: negspacy
  Building wheel for negspacy (pyproject.toml) ... [?25l[?25hdone
  Created wheel for negspacy: filename=negspacy-1.0.4-py3-none-any.whl size=12537 sha256=7ef3a88a3c3f25b30eb0aaf825719028cc76d43006189d6841cc187863e573aa
  Stored in directory: /root/.cache/pip/wheels/15/31/f0/3a217aaedf320e4df653347cd6538f3648263b864c8e140853
Successfully built negspacy
Installing collected packages: negspacy
Successfully installed negspacy-1.0.4


In [None]:
import en_core_web_sm
nlp = en_core_web_sm.load()

ModuleNotFoundError: No module named 'en_core_web_sm'

In [26]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
import spacy
from spacy.tokens import Token
from negspacy.negation import Negex
from negspacy.termsets import termset

# Load SpaCy model
nlp = spacy.load("en_core_web_sm")

# Set up the termset and get patterns
ts = termset("en")

# Register the 'negex' extension on tokens to avoid any missing extension error
Token.set_extension("negex", default=False)

# Add Negex component with configuration using termset patterns
nlp.add_pipe("negex", last=True, config={
    "neg_termset": ts.get_patterns(),
    "ent_types": ["PERSON", "NOUN"],  # Relevant entity types for negation analysis
    "chunk_prefix": ["no", "not", "never", "without"]  # Define common negation indicators
})

# Load training data
trainData = pd.read_csv('/content/drive/MyDrive/finalDataset.csv')
trainData = trainData.dropna()
print(trainData.head())

# Clean tweets
clean_tweets = trainData['cleaned_text']
print(clean_tweets)

# Analyze negation with Negex
def analyze_negation(text):
    doc = nlp(text)
    return " ".join([token.text + "NEG" if token._.negex else token.text for token in doc])

# Apply Negex analysis to cleaned tweets
clean_tweets = clean_tweets.apply(analyze_negation)

# Vectorize tweets
vectorizer = TfidfVectorizer(max_features=None, ngram_range=(1, 2))
train_data_features = vectorizer.fit_transform(clean_tweets)
print(train_data_features.shape)

# Get labels
y_train = trainData['sentiment']

# Train model
model = LogisticRegression(random_state=0, solver='lbfgs', multi_class='multinomial')
model.fit(train_data_features, y_train)

# Load test data
testData = pd.read_csv('/content/drive/MyDrive/finalDatasetTest.csv')
testData = testData.dropna()
print(testData.head())

# Clean test tweets
clean_tweets_test = testData['cleaned_text']
print(clean_tweets_test)

# Analyze negation on test data
clean_tweets_test = clean_tweets_test.apply(analyze_negation)

# Get true labels
y_test = testData['sentiment']

# Vectorize test data
test_data_features = vectorizer.transform(clean_tweets_test)
result = model.predict(test_data_features)

# Create output DataFrame
output = pd.DataFrame(data={"id": testData["textID"], "sentiment": result})

# Save predictions to CSV
output.to_csv("resultPredictionsOnTest.csv", index=False, quoting=3)

# Calculate and print metrics
accuracy = accuracy_score(y_test, result)
print(f'Accuracy: {accuracy:.4f}')

precision = precision_score(y_test, result, average='weighted', zero_division=0)
recall = recall_score(y_test, result, average='weighted', zero_division=0)
f1 = f1_score(y_test, result, average='weighted', zero_division=0)

print(f'Precision: {precision:.4f}')
print(f'Recall: {recall:.4f}')
print(f'F1 Score: {f1:.4f}')

# Print a classification report
print("\nClassification Report:")
print(classification_report(y_test, result))





       textID                                               text  \
0  cb774db0d1                I`d have responded, if I were going   
1  549e992a42      Sooo SAD I will miss you here in San Diego!!!   
2  088c60f138                          my boss is bullying me...   
3  9642c003ef                     what interview! leave me alone   
4  358bd9e861   Sons of ****, why couldn`t they put them on t...   

                         selected_text sentiment Time of Tweet Age of User  \
0  I`d have responded, if I were going   neutral       morning        0-20   
1                             Sooo SAD  negative          noon       21-30   
2                          bullying me  negative         night       31-45   
3                       leave me alone  negative       morning       46-60   
4                        Sons of ****,  negative          noon       60-70   

       Country  Population -2020  Land Area (Km²)  Density (P/Km²)  \
0  Afghanistan          38928346         652860.0   

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


       textID                                               text sentiment  \
0  f87dea47db  Last session of the day  http://twitpic.com/67ezh   neutral   
1  96d74cb729   Shanghai is also really exciting (precisely -...  positive   
2  eee518ae67  Recession hit Veronique Branquinho, she has to...  negative   
3  01082688c6                                        happy bday!  positive   
4  33987a8ee5             http://twitpic.com/4w75p - I like it!!  positive   

  Time of Tweet Age of User      Country  Population -2020  Land Area (Km²)  \
0       morning        0-20  Afghanistan        38928346.0         652860.0   
1          noon       21-30      Albania         2877797.0          27400.0   
2         night       31-45      Algeria        43851044.0        2381740.0   
3       morning       46-60      Andorra           77265.0            470.0   
4          noon       60-70       Angola        32866272.0        1246700.0   

   Density (P/Km²)                                      

In [50]:
negative_words = [
    "2-faced", "2-faces", "abnormal", "abolish", "abominable", "abominably", "abominate",
    "abomination", "abort", "aborted", "aborts", "abrade", "abrasive", "abrupt", "abruptly",
    "abscond", "absence", "absent-minded", "absentee", "absurd", "absurdity", "absurdly",
    "absurdness", "abuse", "abused", "abuses", "abusive", "abysmal", "abysmally", "abyss",
    "accidental", "accost", "accursed", "accusation", "accusations", "accuse", "accuses",
    "accusing", "accusingly", "acerbate", "acerbic", "acerbically", "ache", "ached", "aches",
    "achey", "aching", "acrid", "acridly", "acridness", "acrimonious", "acrimoniously",
    "acrimony", "adamant", "adamantly", "addict", "addicted", "addicting", "addicts",
    "admonish", "admonisher", "admonishingly", "admonishment", "admonition", "adulterate",
    "adulterated", "adulteration", "adulterier", "adversarial", "adversary", "adverse",
    "adversity", "afflict", "affliction", "afflictive", "affront", "afraid", "aggravate",
    "aggravating", "aggravation", "aggression", "aggressive", "aggressiveness", "aggressor",
    "aggrieve", "aggrieved", "aggrivation", "aghast", "agonies", "agonize", "agonizing",
    "agonizingly", "agony", "aground", "ail", "ailing", "ailment", "aimless", "alarm",
    "alarmed", "alarming", "alarmingly", "alienate", "alienated", "alienation", "allegation",
    "allegations", "allege", "allergic", "allergies", "allergy", "aloof", "altercation",
    "ambiguity", "ambiguous", "ambivalence", "ambivalent", "ambush", "amiss", "amputate",
    "anarchism", "anarchist", "anarchistic", "anarchy", "anemic", "anger", "angrily",
    "angriness", "angry", "anguish", "animosity", "annihilate", "annihilation", "annoy",
    "annoyance", "annoyances", "annoyed", "annoying", "annoyingly", "annoys", "anomalous",
    "anomaly", "antagonism", "antagonist", "antagonistic", "antagonize", "anti-", "anti-american",
    "anti-israeli", "anti-occupation", "anti-proliferation", "anti-semites", "anti-social",
    "anti-us", "anti-white", "antipathy", "antiquated", "antithetical", "anxieties", "anxiety",
    "anxious", "anxiously", "anxiousness", "apathetic", "apathetically", "apathy", "apocalypse",
    "apocalyptic", "apologist", "apologists", "appal", "appall", "appalled", "appalling",
    "appallingly", "apprehension", "apprehensions", "apprehensive", "apprehensively", "arbitrary",
    "arcane", "archaic", "arduous", "arduously", "argumentative", "arrogance", "arrogant",
    "arrogantly", "ashamed", "asinine", "asininely", "asinininity", "askance", "asperse",
    "aspersion", "aspersions", "assail", "assassin", "assassinate", "assault", "assult",
    "astray", "asunder", "atrocious", "atrocities", "atrocity", "atrophy", "attack", "attacks",
    "audacious", "audaciously", "audaciousness", "audacity", "audiciously", "austere",
    "authoritarian", "autocrat", "autocratic", "avalanche", "avarice", "avaricious",
    "avariciously", "avenge", "averse", "aversion", "aweful", "awful", "awfully", "awfulness",
    "awkward", "awkwardness", "ax", "babble", "back-logged", "back-wood", "back-woods",
    "backache", "backaches", "backaching", "backbite", "backbiting", "backward", "backwardness",
    "backwood", "backwoods", "bad", "badly", "baffle", "baffled", "bafflement", "baffling",
    "bait", "balk", "banal", "banalize", "bane", "banish", "banishment", "bankrupt", "barbarian", "barbaric", "barbarically", "barbarity", "barbarous",
    "barbarously", "barren", "baseless", "bash", "bashed", "bashful", "bashing", "bastard",
    "bastards", "battered", "battering", "batty", "bearish", "beastly", "bedlam", "bedlamite",
    "befoul", "beg", "beggar", "beggarly", "begging", "beguile", "belabor", "belated",
    "beleaguer", "belie", "belittle", "belittled", "belittling", "bellicose", "belligerence",
    "belligerent", "belligerently", "bemoan", "bemoaning", "bemused", "bent", "berate",
    "bereave", "bereavement", "bereft", "berserk", "beseech", "beset", "besiege", "besmirch",
    "bestial", "betray", "betrayal", "betrayals", "betrayer", "betraying", "betrays", "bewail",
    "beware", "bewilder", "bewildered", "bewildering", "bewilderingly", "bewilderment", "bewitch",
    "bias", "biased", "biases", "bicker", "bickering", "bid-rigging", "bigotries", "bigotry",
    "bitch", "bitchy", "biting", "bitingly", "bitter", "bitterly", "bitterness", "bizarre",
    "blab", "blabber", "blackmail", "blah", "blame", "blameworthy", "bland", "blandish",
    "blaspheme", "blasphemous", "blasphemy", "blasted", "blatant", "blatantly", "blather", "bleak",
    "bleakly", "bleakness", "bleed", "bleeding", "bleeds", "blemish", "blind", "blinding",
    "blindingly", "blindside", "blister", "blistering", "bloated", "blockage", "blockhead",
    "bloodshed", "bloodthirsty", "bloody", "blotchy", "blow", "blunder", "blundering", "blunders",
    "blunt", "blur", "bluring", "blurred", "blurring", "blurry", "blurs", "blurt", "boastful",
    "boggle", "bogus", "boil", "boiling", "boisterous", "bomb", "bombard", "bombardment",
    "bombastic", "bondage", "bonkers", "bore", "bored", "boredom", "bores", "boring", "botch",
    "bother", "bothered", "bothering", "bothers", "bothersome", "bowdlerize", "boycott",
    "braggart", "bragger", "brainless", "brainwash", "brash", "brashly", "brashness", "brat",
    "bravado", "brazen", "brazenly", "brazenness", "breach", "break", "break-up", "break-ups",
    "breakdown", "breaking", "breaks", "breakup", "breakups", "bribery", "brimstone", "bristle",
    "brittle", "broke", "broken", "broken-hearted", "brood", "browbeat", "bruise", "bruised",
    "bruises", "bruising", "brusque", "brutal", "brutalising", "brutalities", "brutality",
    "brutalize", "brutalizing", "brutally", "brute", "brutish", "bs", "buckle", "bug", "bugging",
    "buggy", "bugs", "bulkier", "bulkiness", "bulky", "bulkyness", "bull****", "bull----",
    "bullies", "bullshit", "bullshyt", "bully", "bullying", "bullyingly", "bum", "bump",
    "bumped", "bumping", "bumpping", "bumps", "bumpy", "bungle", "bungler", "bungling", "bunk",
    "burden", "burdensome", "burdensomely", "burn", "burned", "burning", "burns", "bust",
    "busts", "busybody", "butcher", "butchery", "buzzing", "byzantine", "cackle", "calamities",
    "calamitous", "calamitously", "calamity", "callous", "calumniate", "calumniation",
    "calumnies", "calumnious", "calumniously", "calumny", "cancer", "cancerous", "cannibal",
    "cannibalize", "capitulate", "capricious", "capriciously", "capriciousness", "capsize",
    "careless", "carelessness", "caricature", "carnage", "carp", "cartoonish", "cash-strapped",
    "castigate", "castrated", "casualty", "cataclysm", "cataclysmal", "cataclysmic",
    "cataclysmically", "catastrophe", "catastrophes", "catastrophic", "catastrophically",
    "catastrophies", "caustic", "caustically", "cautionary", "cave", "censure", "chafe",
    "chaff", "chagrin", "challenging", "chaos", "chaotic", "chasten", "chastise", "chastisement",
    "chatter", "chatterbox", "cheap", "cheapen", "cheaply", "cheat", "cheated", "cheater",
    "cheating", "cheats", "checkered", "cheerless", "cheesy", "chide", "childish", "chill",
    "chilly", "chintzy", "choke", "choleric", "choppy", "chore", "chronic", "chunky", "clamor",
    "clamorous", "clash", "cliche", "cliched", "clique", "clog", "clogged", "clogs", "cloud",
    "clouding", "cloudy", "clueless", "clumsy", "clunky", "coarse", "cocky", "coerce",
    "coercion", "coercive", "cold", "coldly", "collapse", "collude", "collusion", "combative",
    "combust", "comical", "commiserate", "commonplace", "commotion", "commotions",
    "complacent", "complain", "complained", "complaining", "complains", "complaint",
    "complaints", "complex", "complicated", "complication", "complicit", "compulsion",
    "compulsive", "concede", "conceded", "conceit", "conceited", "concen", "concens",
    "concern", "concerned", "concerns", "concession", "concessions", "condemn", "condemnable", "condemnation",
    "condemned", "condemns", "condescend", "condescending", "condescendingly",
    "condescension", "confess", "confession", "confessions", "confined", "conflict",
    "conflicted", "conflicting", "conflicts", "confound", "confounded", "confounding",
    "confront", "confrontation", "confrontational", "confuse", "confused", "confuses",
    "confusing", "confusion", "confusions", "congested", "congestion", "cons",
    "conscons", "conservative", "conspicuous", "conspicuously", "conspiracies",
    "conspiracy", "conspirator", "conspiratorial", "conspire", "consternation",
    "contagious", "contaminate", "contaminated", "contaminates", "contaminating",
    "contamination", "contempt", "contemptible", "contemptuous", "contemptuously",
    "contend", "contention", "contentious", "contort", "contortions", "contradict",
    "contradiction", "contradictory", "contrariness", "contravene", "contrive",
    "contrived", "controversial", "controversy", "convoluted", "corrode", "corrosion",
    "corrosions", "corrosive", "corrupt", "corrupted", "corrupting", "corruption",
    "corrupts", "corruptted", "costlier", "costly", "counter-productive",
    "counterproductive", "coupists", "covetous", "coward", "cowardly", "crabby",
    "crack", "cracked", "cracks", "craftily", "craftly", "crafty", "cramp", "cramped",
    "cramping", "cranky", "crap", "crappy", "craps", "crash", "crashed", "crashes",
    "crashing", "crass", "craven", "cravenly", "craze", "crazily", "craziness",
    "crazy", "creak", "creaking", "creaks", "credulous", "creep", "creeping",
    "creeps", "creepy", "crept", "crime", "criminal", "cringe", "cringed",
    "cringes", "cripple", "crippled", "cripples", "crippling", "crisis", "critic",
    "critical", "criticism", "criticisms", "criticize", "criticized", "criticizing",
    "critics", "cronyism", "crook", "crooked", "crooks", "crowded", "crowdedness",
    "crude", "cruel", "crueler", "cruelest", "cruelly", "cruelness", "cruelties",
    "cruelty", "crumble", "crumbling", "crummy", "crumple", "crumpled", "crumples",
    "crush", "crushed", "crushing", "cry", "culpable", "culprit", "cumbersome",
    "cunt", "cunts", "cuplrit", "curse", "cursed", "curses", "curt", "cuss",
    "cussed", "cutthroat", "cynical", "cynicism", "d*mn", "damage", "damaged",
    "damages", "damaging", "damn", "damnable", "damnably", "damnation", "damned",
    "damning", "damper", "danger", "dangerous", "dangerousness", "dark", "darken",
    "darkened", "darker", "darkness", "dastard", "dastardly", "daunt", "daunting",
    "dauntingly", "dawdle", "daze", "dazed", "dead", "deadbeat", "deadlock", "deadly",
    "deadweight", "deaf", "dearth", "death", "debacle", "debase", "debasement",
    "debaser", "debatable", "debauch", "debaucher", "debauchery", "debilitate",
    "debilitating", "debility", "debt", "debts", "decadence", "decadent", "decay",
    "decayed", "deceit", "deceitful", "deceitfully", "deceitfulness", "deceive",
    "deceiver", "deceivers", "deceiving", "deception", "deceptive", "deceptively",
    "declaim", "decline", "declines", "declining", "decrement", "decrepit",
    "decrepitude", "decry", "defamation", "defamations", "defamatory", "defame",
    "defect", "defective", "defects", "defensive", "defiance", "defiant",
    "defiantly", "deficiencies", "deficiency", "deficient", "defile", "defiler",
    "deform", "deformed", "defrauding", "defunct", "defy", "degenerate",
    "degenerately", "degeneration", "degradation", "degrade", "degrading",
    "degradingly", "dehumanization", "dehumanize", "deign", "deject", "dejected",
    "dejectedly", "dejection", "delay", "delayed", "delaying", "delays",
    "delinquency", "delinquent", "delirious", "delirium", "delude", "deluded",
    "deluge", "delusion", "delusional", "delusions", "demean", "demeaning",
    "demise", "demolish", "demolisher", "demon", "demonic", "demonize",
    "demonized", "demonizes", "demonizing", "demoralize", "demoralizing",
    "demoralizingly", "denial", "denied", "denies", "denigrate", "denounce",
    "dense", "dent", "dented"
]


In [51]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
import spacy
from spacy.tokens import Token
from negspacy.negation import Negex
from negspacy.termsets import termset

# Load SpaCy model
nlp = spacy.load("en_core_web_sm")

# Set up the termset and get patterns
ts = termset("en")

# Register the 'negex' extension on tokens (force=True to avoid extension errors)
Token.set_extension("negex", default=False, force=True)

# Add Negex component with configuration using termset patterns
nlp.add_pipe("negex", last=True, config={
    "neg_termset": ts.get_patterns(),
    "ent_types": ["PERSON", "NOUN"],  # Relevant entity types for negation analysis
    "chunk_prefix": ["no", "not", "never", "without"]  # Define common negation indicators
})

# Define a list of negative words
#negative_words = ["bad", "worse", "worst", "terrible", "awful", "hate", "dislike", "not good", "no", "never"]

# Load training data
trainData = pd.read_csv('/content/drive/MyDrive/finalDataset.csv')
trainData = trainData.dropna()
print(trainData.head())

# Clean tweets
clean_tweets = trainData['cleaned_text']
print(clean_tweets)

# Analyze negation with Negex and negative words
def analyze_negation(text):
    doc = nlp(text)
    analyzed_tokens = []

    for token in doc:
        # Check if the token is a negative word
        if token.text.lower() in negative_words:
            analyzed_tokens.append(token.text + "NEG")  # Mark as negative word
        elif token._.negex:
            analyzed_tokens.append(token.text + "NEG")  # Mark as negated
        else:
            analyzed_tokens.append(token.text)  # Keep the token as is

    return " ".join(analyzed_tokens)

# Apply Negex analysis to cleaned tweets
clean_tweets = clean_tweets.apply(analyze_negation)

# Vectorize tweets
vectorizer = TfidfVectorizer(max_features=None, ngram_range=(1, 2))
train_data_features = vectorizer.fit_transform(clean_tweets)
print(train_data_features.shape)

# Get labels
y_train = trainData['sentiment']

# Train model
model = LogisticRegression(random_state=0, solver='lbfgs', multi_class='multinomial')
model.fit(train_data_features, y_train)

# Load test data
testData = pd.read_csv('/content/drive/MyDrive/finalDatasetTest.csv')
testData = testData.dropna()
print(testData.head())

# Clean test tweets
clean_tweets_test = testData['cleaned_text']
print(clean_tweets_test)

# Analyze negation on test data
clean_tweets_test = clean_tweets_test.apply(analyze_negation)

# Get true labels
y_test = testData['sentiment']

# Vectorize test data
test_data_features = vectorizer.transform(clean_tweets_test)
result = model.predict(test_data_features)

# Create output DataFrame
output = pd.DataFrame(data={"id": testData["textID"], "sentiment": result})

# Save predictions to CSV
output.to_csv("resultPredictionsOnTest.csv", index=False, quoting=3)

# Calculate and print metrics
accuracy = accuracy_score(y_test, result)
print(f'Accuracy: {accuracy:.4f}')

precision = precision_score(y_test, result, average='weighted', zero_division=0)
recall = recall_score(y_test, result, average='weighted', zero_division=0)
f1 = f1_score(y_test, result, average='weighted', zero_division=0)

print(f'Precision: {precision:.4f}')
print(f'Recall: {recall:.4f}')
print(f'F1 Score: {f1:.4f}')

# Print a classification report
print("\nClassification Report:")
print(classification_report(y_test, result))





       textID                                               text  \
0  cb774db0d1                I`d have responded, if I were going   
1  549e992a42      Sooo SAD I will miss you here in San Diego!!!   
2  088c60f138                          my boss is bullying me...   
3  9642c003ef                     what interview! leave me alone   
4  358bd9e861   Sons of ****, why couldn`t they put them on t...   

                         selected_text sentiment Time of Tweet Age of User  \
0  I`d have responded, if I were going   neutral       morning        0-20   
1                             Sooo SAD  negative          noon       21-30   
2                          bullying me  negative         night       31-45   
3                       leave me alone  negative       morning       46-60   
4                        Sons of ****,  negative          noon       60-70   

       Country  Population -2020  Land Area (Km²)  Density (P/Km²)  \
0  Afghanistan          38928346         652860.0   

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


       textID                                               text sentiment  \
0  f87dea47db  Last session of the day  http://twitpic.com/67ezh   neutral   
1  96d74cb729   Shanghai is also really exciting (precisely -...  positive   
2  eee518ae67  Recession hit Veronique Branquinho, she has to...  negative   
3  01082688c6                                        happy bday!  positive   
4  33987a8ee5             http://twitpic.com/4w75p - I like it!!  positive   

  Time of Tweet Age of User      Country  Population -2020  Land Area (Km²)  \
0       morning        0-20  Afghanistan        38928346.0         652860.0   
1          noon       21-30      Albania         2877797.0          27400.0   
2         night       31-45      Algeria        43851044.0        2381740.0   
3       morning       46-60      Andorra           77265.0            470.0   
4          noon       60-70       Angola        32866272.0        1246700.0   

   Density (P/Km²)                                      