In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, IntegerType


In [2]:
spark = SparkSession.builder.appName("tweets").getOrCreate()

In [3]:
schema = StructType([
    StructField('target', IntegerType(), True),
    StructField('id', IntegerType(), True),
    StructField('date', StringType(), True),
    StructField('flag', StringType(), True),
    StructField('user', StringType(), True),
    StructField('text', StringType(), True)
])

In [4]:
df = spark.read.csv("../database", schema=schema)
df.show(n=5)

+------+----------+--------------------+--------+---------------+--------------------+
|target|        id|                date|    flag|           user|                text|
+------+----------+--------------------+--------+---------------+--------------------+
|     0|1467810369|Mon Apr 06 22:19:...|NO_QUERY|_TheSpecialOne_|@switchfoot http:...|
|     0|1467810672|Mon Apr 06 22:19:...|NO_QUERY|  scotthamilton|is upset that he ...|
|     0|1467810917|Mon Apr 06 22:19:...|NO_QUERY|       mattycus|@Kenichan I dived...|
|     0|1467811184|Mon Apr 06 22:19:...|NO_QUERY|        ElleCTF|my whole body fee...|
|     0|1467811193|Mon Apr 06 22:19:...|NO_QUERY|         Karoli|@nationwideclass ...|
+------+----------+--------------------+--------+---------------+--------------------+
only showing top 5 rows



In [5]:
df = df.select("target", "text")
df.show(n=5)

+------+--------------------+
|target|                text|
+------+--------------------+
|     0|@switchfoot http:...|
|     0|is upset that he ...|
|     0|@Kenichan I dived...|
|     0|my whole body fee...|
|     0|@nationwideclass ...|
+------+--------------------+
only showing top 5 rows



In [6]:
df.groupBy("target").count().show()

+------+------+
|target| count|
+------+------+
|     0|800000|
|     4|800000|
+------+------+



In [7]:
from pyspark.sql.functions import when
change_target = when(df['target'] == 4, 1).otherwise(df['target'])
df = df.withColumn('target', change_target)

In [8]:
df.groupBy("target").count().show()

+------+------+
|target| count|
+------+------+
|     0|800000|
|     1|800000|
+------+------+



In [9]:
null_values = df.filter(df['text'].isNull()).count()
null_values

0

In [10]:
import re
def preprocess_tweet(tweet):
    tweet = tweet.lower()
    # Remove URLs, mentions, and hashtags
    tweet = re.sub(r'@\w+|\w+://\S+|(#\S+)', '', tweet)
    # Remove non-letters e.g punctuation, numbers
    tweet = re.sub(r'[^a-zA-Z\s]+', '', tweet) 
    return tweet  

In [11]:
from pyspark.sql.functions import udf
preprocess_udf = udf(preprocess_tweet, StringType())
new_df = df.withColumn('text', preprocess_udf(df['text']))
new_df.show(n=5)

+------+--------------------+
|target|                text|
+------+--------------------+
|     0|   awww thats a b...|
|     0|is upset that he ...|
|     0| i dived many tim...|
|     0|my whole body fee...|
|     0| no its not behav...|
+------+--------------------+
only showing top 5 rows



In [12]:
rows = df.select('text').collect()
print(rows[8]["text"])

new_rows = new_df.select('text').collect()
print(new_rows[8]["text"])

@Tatiana_K nope they didn't have it 
 nope they didnt have it 


In [14]:
!pip install nltk

Collecting nltk
  Downloading nltk-3.9.2-py3-none-any.whl.metadata (3.2 kB)
Collecting regex>=2021.8.3 (from nltk)
  Downloading regex-2025.11.3-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl.metadata (40 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.5/40.5 kB[0m [31m896.8 kB/s[0m eta [36m0:00:00[0m [36m0:00:01[0m
Downloading nltk-3.9.2-py3-none-any.whl (1.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.5/1.5 MB[0m [31m998.1 kB/s[0m eta [36m0:00:00[0m[36m0:00:01[0m0m
[?25hDownloading regex-2025.11.3-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl (793 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m793.4/793.4 kB[0m [31m1.1 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hInstalling collected packages: regex, nltk
Successfully installed nltk-3.9.2 regex-2025.11.3


In [15]:
from nltk.stem import PorterStemmer

ps = PorterStemmer()

def porterStemmerTweet(tweet):
    if tweet is None:
        return None
    new_tweet = ""
    for word in tweet.split(" "):
        new_tweet += ps.stem(word) + " "
    return new_tweet.strip()

In [16]:
new_df.show(n=5)

+------+--------------------+
|target|                text|
+------+--------------------+
|     0|   awww thats a b...|
|     0|is upset that he ...|
|     0| i dived many tim...|
|     0|my whole body fee...|
|     0| no its not behav...|
+------+--------------------+
only showing top 5 rows



In [17]:
preprocess_stem_udf = udf(porterStemmerTweet, StringType())
new_df = new_df.withColumn('text', preprocess_stem_udf(new_df['text']))
new_df.show(n=5)

+------+--------------------+
|target|                text|
+------+--------------------+
|     0|awww that a bumme...|
|     0|is upset that he ...|
|     0|i dive mani time ...|
|     0|my whole bodi fee...|
|     0|no it not behav a...|
+------+--------------------+
only showing top 5 rows



In [20]:
df.coalesce(1) \
  .write \
  .option("header", "true") \
  .mode("overwrite") \
  .csv("./database/new_df")

                                                                                

In [18]:
from pyspark.ml.feature import Tokenizer, StopWordsRemover, HashingTF

In [19]:
tokenizer = Tokenizer(inputCol="text", outputCol="tokens")
remover = StopWordsRemover(inputCol="tokens", outputCol="filtered_tokens")
df_tokens = tokenizer.transform(df)
# df_tokens.show(truncate=False)

df_remover = remover.transform(df_tokens)
df_remover.show(truncate=False)

+------+---------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------------+------------------------------------------------------------------------------------------------------------+
|target|text                                                                                                                 |tokens                                                                                                                                       |filtered_tokens                                                                                             |
+------+---------------------------------------------------------------------------------------------------------------------+----------------------------------------------------------------------------------------------------------------------

In [20]:
hashing_tf = HashingTF(
    inputCol="filtered_tokens",
    outputCol="raw_features",
    numFeatures=2**14  # you can tune this
)
hashing_df = hashing_tf.transform(df_remover)
hashing_df.show(truncate=False)

+------+---------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------------+------------------------------------------------------------------------------------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------+
|target|text                                                                                                                 |tokens                                                                                                                                       |filtered_tokens                                                                                             |raw_features                                                                                  

In [21]:
from pyspark.ml.classification import LinearSVC


In [22]:
train_df, test_df = hashing_df.randomSplit([0.9, 0.1], seed=42)

## Linear SVC

In [28]:
svm = LinearSVC(
    featuresCol="raw_features",
    labelCol="target",
    maxIter=200,
    regParam=0.1
)

In [29]:
model = svm.fit(train_df)

In [30]:
predictions = model.transform(test_df)
predictions.select("target", "prediction", "rawPrediction").show(10, truncate=False)

+------+----------+------------------------------------------+
|target|prediction|rawPrediction                             |
+------+----------+------------------------------------------+
|0     |1.0       |[-0.03887278397316299,0.03887278397316299]|
|0     |0.0       |[0.5376788738983288,-0.5376788738983288]  |
|0     |0.0       |[0.431778074163398,-0.431778074163398]    |
|0     |0.0       |[1.1223644960518568,-1.1223644960518568]  |
|0     |0.0       |[0.12539461613773975,-0.12539461613773975]|
|0     |1.0       |[-0.19429277242949577,0.19429277242949577]|
|0     |0.0       |[0.5925933948435009,-0.5925933948435009]  |
|0     |0.0       |[1.4966230842864516,-1.4966230842864516]  |
|0     |0.0       |[3.3830066164279016,-3.3830066164279016]  |
|0     |1.0       |[-1.8526332937966976,1.8526332937966976]  |
+------+----------+------------------------------------------+
only showing top 10 rows



In [31]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

evaluator = BinaryClassificationEvaluator(
    labelCol="target",
    rawPredictionCol="rawPrediction",   # default for LinearSVC & LogisticRegression
    metricName="areaUnderROC"           # or "areaUnderPR"
)

auc = evaluator.evaluate(predictions)
print("AUC (ROC):", auc)

AUC (ROC): 0.8208948763298934


## Logistic Regression

In [32]:
from pyspark.ml.classification import LogisticRegression
lr = LogisticRegression(featuresCol="raw_features", labelCol="target",maxIter=200, regParam=0.01)


In [33]:
model = lr.fit(train_df)

In [34]:
predictions = model.transform(test_df)
predictions.select("target", "prediction", "rawPrediction").show(10, truncate=False)

+------+----------+------------------------------------------+
|target|prediction|rawPrediction                             |
+------+----------+------------------------------------------+
|0     |0.0       |[0.1757218500903969,-0.1757218500903969]  |
|0     |0.0       |[0.8584967906132306,-0.8584967906132306]  |
|0     |0.0       |[0.7519899781412592,-0.7519899781412592]  |
|0     |0.0       |[1.5040557172354578,-1.5040557172354578]  |
|0     |0.0       |[0.2632813565686486,-0.2632813565686486]  |
|0     |1.0       |[-0.11458124279981874,0.11458124279981874]|
|0     |0.0       |[1.0276917144518487,-1.0276917144518487]  |
|0     |0.0       |[2.045554470332043,-2.045554470332043]    |
|0     |0.0       |[5.00948751956508,-5.00948751956508]      |
|0     |1.0       |[-2.217371040709446,2.217371040709446]    |
+------+----------+------------------------------------------+
only showing top 10 rows



In [35]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

evaluator = BinaryClassificationEvaluator(
    labelCol="target",
    rawPredictionCol="rawPrediction",   # default for LinearSVC & LogisticRegression
    metricName="areaUnderROC"           # or "areaUnderPR"
)

auc = evaluator.evaluate(predictions)
print("AUC (ROC):", auc)

AUC (ROC): 0.8222536026933774


## RandomForrest

In [None]:
from pyspark.ml.classification import RandomForestClassifier
rf = RandomForestClassifier(featuresCol="raw_features", labelCol="target", numTrees=100)