In [14]:
import pickle
from pathlib import Path

import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer


# Load the pre-split training and validation sets (paths are relative to the
# notebook's working directory).
train_df = pd.read_csv("../data/train_data.csv")
valid_df = pd.read_csv("../data/valid_data.csv")

from scripts.data_cleaning import clean_text


def _ensure_clean_text(df):
    """Make sure ``df`` has a string-valued "clean_text" column.

    If the column is absent it is built by applying ``clean_text`` to the
    "text" column. If it is already present (e.g. cached in the CSV) it is
    kept, but missing values are replaced with "" so the vectorizers used
    later receive strings only.

    The frame is modified in place and also returned.
    """
    if "clean_text" in df.columns:
        # Empty strings written to CSV are read back as NaN by pandas.
        df["clean_text"] = df["clean_text"].fillna("").astype(str)
    else:
        # fillna first: astype(str) alone would turn missing text into the
        # literal token "nan".
        df["clean_text"] = df["text"].fillna("").astype(str).apply(clean_text)
    return df


# Each split is checked on its own. Checking only train_df would leave
# valid_df without "clean_text" whenever just the training CSV has the column.
train_df = _ensure_clean_text(train_df)
valid_df = _ensure_clean_text(valid_df)

print("Data loaded and cleaned")
print(train_df[["text", "clean_text"]].head())




Data loaded and cleaned
                                                text  \
0  Here are Thursday's biggest analyst calls: App...   
1  Buy Las Vegas Sands as travel to Singapore bui...   
2  Piper Sandler downgrades DocuSign to sell, cit...   
3  Analysts react to Tesla's latest earnings, bre...   
4  Netflix and its peers are set for a â€˜return to...   

                                          clean_text  
0  thursday biggest analyst call apple amazon tes...  
1  buy la vega sand travel singapore build well f...  
2  piper sandler downgrade docusign sell citing e...  
3  analyst react tesla latest earnings break what...  
4  netflix peer set return growth analyst say giv...  


In [6]:
# Diagnostic cell: print the project root, then import the local `scripts`
# package and report which file it was loaded from.
# NOTE(review): PROJECT_ROOT is not defined in any cell visible here --
# presumably it is set in an earlier setup cell; confirm that cell still
# exists so this one survives Restart Kernel -> Run All.
print("Project root:", PROJECT_ROOT)
import scripts
print("scripts module found at:", scripts.__file__)


Project root: h:\twitter_analysis
scripts module found at: h:\twitter_analysis\scripts\__init__.py


In [4]:
#TF-IDF Vectorization
# The vectorizer is fitted on the training split only; the validation split
# is transformed with that same fitted vocabulary.

tfidf=TfidfVectorizer(max_features=5000)

X_train_tfidf=tfidf.fit_transform(train_df["clean_text"])
X_valid_tfidf=tfidf.transform(valid_df["clean_text"])

y_train=train_df["label"]
y_valid=valid_df["label"]

print("TF-IDF Feature Shape: ", X_train_tfidf.shape,X_valid_tfidf.shape)

# Create the output directory before writing: on a fresh checkout without
# ../models the open() below would raise FileNotFoundError after the fit
# has already been done.
models_dir=Path("../models")
models_dir.mkdir(parents=True, exist_ok=True)

# Persist the fitted vectorizer so it can be reloaded outside this notebook.
with open(models_dir/"tfidf_vectorizer.pkl","wb") as f:
    pickle.dump(tfidf,f)
print("TF-IDF Veactorizer saved.")

TF-IDF Feature Shape:  (16990, 5000) (4117, 5000)
TF-IDF Veactorizer saved.


In [None]:
#Keras Tokenizer
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

VOCAB_SIZE=5000   # passed to Tokenizer as num_words
MAX_LENGTH=50     # every sequence is padded/truncated to this many tokens

# Fit the tokenizer on the training split only; words it has not seen are
# mapped to the "<OOV>" token.
tokenizer=Tokenizer(num_words=VOCAB_SIZE, oov_token="<OOV>")
tokenizer.fit_on_texts(train_df["clean_text"])

X_train_seq=tokenizer.texts_to_sequences(train_df["clean_text"])
X_valid_seq=tokenizer.texts_to_sequences(valid_df["clean_text"])

# "post" for both options: padding is appended at the end and over-long
# sequences are cut at the end.
X_train_padded=pad_sequences(X_train_seq, maxlen=MAX_LENGTH, padding="post", truncating="post")
X_valid_padded=pad_sequences(X_valid_seq, maxlen=MAX_LENGTH, padding="post", truncating="post")

print("Padded Sequences Shape: ", X_train_padded.shape, X_valid_padded.shape)

# Create the output directory before writing: on a fresh checkout without
# ../models the open() below would raise FileNotFoundError.
models_dir=Path("../models")
models_dir.mkdir(parents=True, exist_ok=True)

# Persist the fitted tokenizer so the same word index can be reused later.
with open(models_dir/"tokenizer.pkl", "wb") as f:
    pickle.dump(tokenizer, f)
print("tokenizer saved")


Padded Sequences Shape:  (16990, 50) (4117, 50)
tokenizer saved
