In [42]:
#Task A
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.util import ngrams
from collections import Counter

# Make sure the NLTK resources used by the cells below are available locally.
for resource in ('punkt', 'wordnet', 'stopwords'):
    nltk.download(resource)

df = pd.read_csv('train (5).csv')

# Split the rows on the `target` column: rows with target == 1 are kept as
# disaster tweets, rows with target == 0 as normal tweets.
disaster_tweets = df.loc[df['target'].eq(1)]
normal_tweets = df.loc[df['target'].eq(0)]

print("normal" ,normal_tweets)
print("disaster" , disaster_tweets)



normal          id  keyword location  \
15       23      NaN      NaN   
16       24      NaN      NaN   
17       25      NaN      NaN   
18       26      NaN      NaN   
19       28      NaN      NaN   
...     ...      ...      ...   
7581  10833  wrecked  Lincoln   
7582  10834  wrecked      NaN   
7584  10837      NaN      NaN   
7587  10841      NaN      NaN   
7593  10848      NaN      NaN   

                                                   text  target  
15                                       What's up man?       0  
16                                        I love fruits       0  
17                                     Summer is lovely       0  
18                                    My car is so fast       0  
19                         What a goooooooaaaaaal!!!!!!       0  
...                                                 ...     ...  
7581  @engineshed Great atmosphere at the British Li...       0  
7582  Cramer: Iger's 3 words that wrecked Disney's s...       0  
75

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [40]:

def preprocess_tweets(tweets):
    """Turn an iterable of tweet strings into one flat list of cleaned tokens.

    Each tweet is lower-cased and tokenised with ``word_tokenize``. Tokens
    that are not purely alphabetic, or that are English stop words, are
    dropped; the remaining tokens are lemmatised with the WordNet lemmatizer.

    Args:
        tweets: Iterable of tweet strings (for example a pandas Series).

    Returns:
        list: The lemmatised tokens of all tweets, concatenated in input
        order. Tweet boundaries are not preserved in the result.
    """
    lemmatizer = nltk.WordNetLemmatizer()
    # Build the stop-word set once. ``stopwords.words`` returns a list, so
    # calling it inside the per-word filter rebuilt that list for every token
    # and made each membership test a linear scan.
    stop_words = set(stopwords.words('english'))

    tokens = []
    for tweet in tweets:
        tokens.extend(
            lemmatizer.lemmatize(word)
            for word in word_tokenize(tweet.lower())
            if word.isalpha() and word not in stop_words
        )
    return tokens

# Build one flat token list per class.
disaster_tokens = preprocess_tweets(disaster_tweets['text'])
normal_tokens = preprocess_tweets(normal_tweets['text'])

# Show the size and a short preview of each list; printing the full lists
# would dump every token of the corpus into the cell output.
PREVIEW_SIZE = 20
print("disaster:", len(disaster_tokens), "tokens, first", PREVIEW_SIZE, "->", disaster_tokens[:PREVIEW_SIZE])
print("normal:", len(normal_tokens), "tokens, first", PREVIEW_SIZE, "->", normal_tokens[:PREVIEW_SIZE])




In [41]:

# disaster_tokens / normal_tokens are already built by the preprocessing cell
# above; calling preprocess_tweets again here would only repeat that pass.
disaster_word_count = Counter(disaster_tokens)
normal_word_count = Counter(normal_tokens)

# Report the vocabulary size per class; printing the Counters themselves
# would list every distinct word with its count.
print("disaster", len(disaster_word_count), "distinct words")
print("normal", len(normal_word_count), "distinct words")




In [None]:

# The 20 most frequent tokens of each class, as (word, count) pairs.
top_20_disaster_words, top_20_normal_words = (
    word_count.most_common(20)
    for word_count in (disaster_word_count, normal_word_count)
)

print("Top 20 disaster words:", top_20_disaster_words)
print("Top 20 normal words:", top_20_normal_words)

def get_top_ngrams(tokens, n=2, top_n=20):
    """Return the most frequent n-grams found in a token sequence.

    Args:
        tokens: Sequence of tokens to slide the n-gram window over.
        n: Number of tokens per n-gram (2 for bigrams, 3 for trigrams).
        top_n: How many of the most frequent n-grams to return.

    Returns:
        list: ``(ngram, count)`` pairs as produced by ``Counter.most_common``.
    """
    return Counter(ngrams(tokens, n)).most_common(top_n)


# Bigram (n=2) and trigram (n=3) frequency tables for each class.
top_20_disaster_bigrams, top_20_disaster_trigrams = (
    get_top_ngrams(disaster_tokens, n=size, top_n=20) for size in (2, 3)
)
top_20_normal_bigrams, top_20_normal_trigrams = (
    get_top_ngrams(normal_tokens, n=size, top_n=20) for size in (2, 3)
)


print("Top 20 disaster bigrams:", top_20_disaster_bigrams)
print("Top 20 disaster trigrams:", top_20_disaster_trigrams)
print("Top 20 normal bigrams:", top_20_normal_bigrams)
print("Top 20 normal trigrams:", top_20_normal_trigrams)


In [None]:
#Task B
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from string import punctuation

nltk.download('stopwords')

df = pd.read_csv('train (5).csv')

# Build the two row masks first, then select the matching rows.
is_disaster = df['target'].eq(1)
is_normal = df['target'].eq(0)
disaster_tweets = df[is_disaster]
normal_tweets = df[is_normal]

# Cache for the English stop-word set so it is loaded once, not once per tweet.
_STOP_WORDS_CACHE = {}

# Translation table that deletes every character in ``string.punctuation``.
_PUNCTUATION_TABLE = str.maketrans('', '', punctuation)


def _english_stop_words():
    """Return NLTK's English stop words as a frozenset, building it only once."""
    if 'english' not in _STOP_WORDS_CACHE:
        _STOP_WORDS_CACHE['english'] = frozenset(stopwords.words('english'))
    return _STOP_WORDS_CACHE['english']


def preprocess_tweet(tweet):
    """Clean a single tweet for text analysis.

    Steps, in order: remove ``@mention`` handles, lower-case the text, split
    on whitespace, delete ASCII punctuation from each word, and drop English
    stop words.

    A word is treated as a stop word when either its punctuation-free form or
    its form with only leading/trailing punctuation trimmed is in the stop-word
    list. The second check is what removes contractions such as "don't":
    deleting the apostrophe first turns it into "dont", which the
    punctuation-free check alone does not match.

    Args:
        tweet: The raw tweet text as a string.

    Returns:
        str: The remaining words joined by single spaces (possibly empty).
    """
    stop_words = _english_stop_words()
    text = re.sub(r'@\w+', '', tweet).lower()

    kept = []
    for word in text.split():
        bare = word.translate(_PUNCTUATION_TABLE)
        if not bare:
            # The word consisted only of punctuation.
            continue
        if bare in stop_words or word.strip(punctuation) in stop_words:
            continue
        kept.append(bare)
    return ' '.join(kept)


# disaster_tweets / normal_tweets are boolean-filtered slices of df, so writing
# a column into them in place triggers pandas' SettingWithCopyWarning.
# ``assign`` returns a new DataFrame with the cleaned text column instead.
disaster_tweets = disaster_tweets.assign(text=disaster_tweets['text'].apply(preprocess_tweet))
normal_tweets = normal_tweets.assign(text=normal_tweets['text'].apply(preprocess_tweet))

print("Disaster tweets:", disaster_tweets['text'])
print("Normal tweets:", normal_tweets['text'])



In [34]:
# Task C
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, recall_score

train_df = pd.read_csv('train.csv')


# Hold out 20% of the rows for validation; the fixed random_state keeps the
# split the same on every run.
reviews = train_df['review']
sentiments = train_df['sentiment']
X_train, X_val, y_train, y_val = train_test_split(
    reviews,
    sentiments,
    test_size=0.2,
    random_state=42,
)

def train_and_evaluate(max_features):
    """Fit a bag-of-words logistic regression and score it on the validation split.

    Uses the module-level ``X_train``, ``X_val``, ``y_train`` and ``y_val``.

    Args:
        max_features: Vocabulary size limit passed to ``CountVectorizer``.

    Returns:
        tuple: ``(accuracy, f1, recall)`` on the validation split, with F1 and
        recall computed for the ``'positive'`` label.
    """
    # Learn the vocabulary on the training text only, then reuse it for validation.
    bag_of_words = CountVectorizer(max_features=max_features)
    train_matrix = bag_of_words.fit_transform(X_train)
    val_matrix = bag_of_words.transform(X_val)

    classifier = LogisticRegression(max_iter=1000)
    classifier.fit(train_matrix, y_train)
    predictions = classifier.predict(val_matrix)

    return (
        accuracy_score(y_val, predictions),
        f1_score(y_val, predictions, pos_label='positive'),
        recall_score(y_val, predictions, pos_label='positive'),
    )

# Compare a 100-word vocabulary against a 1000-word vocabulary.
results_100, results_1000 = (
    train_and_evaluate(vocab_size) for vocab_size in (100, 1000)
)

print("results_100" ,results_100)
print("results_1000" , results_1000)




results_100 (0.733625, 0.7349172782684413, 0.7539561000510464)
results_1000 (0.865, 0.8639798488664988, 0.8754466564573762)
