In [1]:
# Mount Google Drive so the project files under /content/drive are reachable
# (this import only exists inside a Colab runtime).
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
import os
# Make the project folder on the mounted Drive the working directory.
os.chdir("/content/drive/MyDrive/CMPSC448/MidtermProject")

In [3]:
import numpy as np
import pandas as pd
import nltk
import re
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import unicodedata
from sklearn.model_selection import train_test_split

In [4]:
# Load the space-separated training file. It has no header row, so the three
# columns are named by hand afterwards.
# Naming gotcha: "TAG" holds the word/token itself; the part-of-speech label
# is in "POS" and the chunk label in "CHUNK".
df = pd.read_csv('/content/drive/MyDrive/CMPSC448/MidtermProject/train.txt', sep=' ', header=None)
df.columns = ["TAG", "POS", "CHUNK"]

In [5]:
# Preview the first rows to confirm the three columns were parsed as expected.
df.head()

Unnamed: 0,TAG,POS,CHUNK
0,Confidence,NN,B-NP
1,in,IN,B-PP
2,the,DT,B-NP
3,pound,NN,I-NP
4,is,VBZ,B-VP


In [6]:
# Inspect the token column ("TAG" stores the words, not the labels).
df['TAG']

0         Confidence
1                 in
2                the
3              pound
4                 is
             ...    
211722            to
211723           San
211724     Francisco
211725       instead
211726             .
Name: TAG, Length: 211727, dtype: object

In [7]:
# (rows, columns) of the raw training frame, before any cleaning.
df.shape


(211727, 3)

In [8]:
def calculate_freq_normal(freq, df):
  """Fill ``freq`` in place with the relative frequency of each token in ``df["TAG"]``.

  Args:
    freq: dict that is updated in place. Counts are added on top of whatever
      values it already holds, and *every* entry (including pre-existing
      ones) is then divided by the number of tokens seen in this call.
    df: any mapping/frame whose ``"TAG"`` entry is an iterable of tokens.

  Returns:
    None. The result is delivered through ``freq``.
  """
  total = 0
  for token in df["TAG"]:
    freq[token] = freq.get(token, 0) + 1
    total += 1

  # Turn raw counts into fractions of the corpus size.
  for token in freq:
    freq[token] /= total

In [9]:
# Remove rows with a missing value in any column. This mutates df in place and
# leaves the original index labels untouched (no renumbering happens here).
df.dropna(inplace=True)



In [10]:
# Collect the index labels of rows whose token is whitespace only, then drop them.
blanks_ids = [
  i for i, token in zip(df.index, df["TAG"])
  if isinstance(token, str) and token.isspace()
]
df.drop(blanks_ids, inplace=True)
# Renumber the rows 0..n-1 so that positional lookups in later cells stay
# valid even when rows were removed here or by the earlier dropna().
df.reset_index(drop=True, inplace=True)
print(len(blanks_ids))

# Replace accented characters with their closest ASCII form for EVERY token.
# The previous version assigned the result outside the loop (so only the last
# row was ever cleaned) and used chained assignment (df["TAG"][i] = ...),
# which pandas does not guarantee to write back to the frame.
# Lemmatization was tried at this step and is intentionally left disabled.
cleaned_tokens = []
for token in df["TAG"]:
  text = unicodedata.normalize('NFKD', token).encode('ascii', 'ignore').decode('utf-8', 'ignore')
  # If folding would erase the token completely (e.g. it has no ASCII
  # equivalent at all), keep the original rather than an empty string.
  cleaned_tokens.append(text.strip() or token.strip())
df["TAG"] = cleaned_tokens


0


In [11]:
# Shape after cleaning; compare with the earlier df.shape to see how many rows were removed.
df.shape

(211727, 3)

In [12]:
# 2. Feature Extraction
def extract_features(sentence, index):
    """Build a feature dict for the token at ``index`` of ``sentence``.

    Args:
        sentence: sequence of items whose first element is the word string,
            e.g. ``[(word, tag), ...]``.
        index: position of the token to describe.

    Returns:
        dict with the word itself, position flags, casing flags, one-character
        prefix/suffix and the neighbouring words ('' at sentence boundaries).
    """
    word = sentence[index][0]
    last = len(sentence) - 1
    # Slices instead of [0] / [-1] so an empty token yields '' rather than
    # raising IndexError.
    first_char = word[:1]
    return {
        'word': word,
        'is_first': index == 0,
        'is_last': index == last,
        # Only the first character decides capitalisation. Comparing the
        # whole word made this flag an exact duplicate of 'is_all_caps'.
        'is_capitalized': first_char.upper() == first_char,
        'is_all_caps': word.upper() == word,
        'is_all_lower': word.lower() == word,
        'prefix-1': first_char,
        'suffix-1': word[-1:],
        'prev_word': '' if index == 0 else sentence[index - 1][0],
        'next_word': '' if index == last else sentence[index + 1][0],
    }

In [13]:
from string import punctuation


In [14]:
# Feature Extraction Function
def define_new_features(df, index, freq):
    """Build the feature dict for the token at position ``index``.

    Args:
        df: integer-indexable sequence of token strings (a list, or a Series
            whose labels are 0..n-1). Despite the name it is a single column
            of tokens, not a whole DataFrame.
        index: position of the token to describe.
        freq: dict mapping token -> relative frequency. Tokens missing from
            the table get a frequency of 0.0 instead of raising KeyError.

    Returns:
        dict of surface features: the token, its length, casing flags,
        digit/punctuation flags, suffix flags, two-character prefix/suffix,
        neighbouring tokens ('' at the boundaries) and the token frequency.
    """
    # Look the token up once; the original re-indexed df for every feature.
    token = df[index]
    is_last = index == len(df) - 1
    following = None if is_last else df[index + 1]
    return {
        'token': token,
        'length': len(token),
        'is_proper': token.istitle(),
        'is_all_caps': token.isupper(),
        'is_all_lower': token.islower(),
        "contains_number": bool(re.search(r'\d', token)),
        'has_punctuation': any(p in token for p in punctuation),
        "plural_s": len(token) >= 3 and token.endswith("s"),
        'prev_word': '' if index == 0 else df[index - 1],
        'next_word': '' if is_last else following,
        # True only when the next token is exactly one punctuation character.
        'preceeds_punctuation': (not is_last) and any(p == following for p in punctuation),
        "has_es_suffix": token.endswith("es"),
        "has_er_suffix": token.endswith("er"),
        "has_ed_suffix": token.endswith("ed"),
        "has_ing_suffix": token.endswith("ing"),
        "has_able_suffix": token.endswith("able"),
        "has_ion_suffix": token.endswith("ion"),
        "suffix-2": token[-2:],
        "prefix-2": token[:2],
        "freq": freq.get(token, 0.0)
    }

In [15]:
from sklearn.preprocessing import LabelEncoder

# Relative frequency of every token in the training corpus (filled in place).
freq = {}
calculate_freq_normal(freq, df)

# Work on a plain list of tokens: define_new_features addresses tokens by
# position (0..n-1), whereas indexing a Series with an int is a *label*
# lookup. That breaks once rows have been dropped, and it is much slower
# when repeated for every feature of every token.
tokens = df['TAG'].tolist()
X = [define_new_features(tokens, x, freq) for x in range(len(tokens))]
y = df['POS'].tolist()

# Fixed seed so the split, and therefore every score reported below, is
# reproducible. 42 is the same seed the stacked-feature split uses.
X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.20, random_state=42)

# Fit the encoder on all labels so tags that only occur in the test split can
# still be transformed.
encoder = LabelEncoder()
encoder.fit(y)
y_train_encoded = encoder.transform(y_train)
y_test_encoded = encoder.transform(y_test)

In [17]:
# One row per token, one column per feature produced by define_new_features.
df_features = pd.DataFrame(X)

# Flatten every row into a single space-separated string (a Series of strings),
# so the text vectorizers below can consume the features as if they were a document.
feature_strings = df_features.astype(str).apply(' '.join, axis=1)

# Split the data into training and testing sets (seeded, 80/20).
# The labels here are the raw POS strings in y, not the encoded integers.
X_train_stack, X_test_stack, y_train_stack, y_test_stack = train_test_split(
    feature_strings, y, test_size=0.20, random_state=42)

In [18]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from scipy.sparse import hstack


# TF-IDF Vectorizer: fitted on the training strings only, then applied to the test strings.
# NOTE(review): both vectorizers run with their default settings. Confirm the
# default tokenisation/lower-casing is what you want for strings that mix
# tokens, punctuation and "True"/"False" flags.
tfidf_vectorizer = TfidfVectorizer()
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train_stack)
X_test_tfidf = tfidf_vectorizer.transform(X_test_stack)
# Count Vectorizer
count_vectorizer = CountVectorizer()
X_train_count = count_vectorizer.fit_transform(X_train_stack)
X_test_count = count_vectorizer.transform(X_test_stack)

# Horizontally stack TF-IDF and Count vectors, so each sample gets both
# representations side by side (column count is the sum of the two).
X_train_combined = hstack([X_train_tfidf, X_train_count])
X_test_combined = hstack([X_test_tfidf, X_test_count])


In [19]:
from sklearn.feature_extraction import FeatureHasher

# Hash the feature dicts into a fixed-width representation with 2**18 columns.
hasher = FeatureHasher(n_features=2**18)
# NOTE(review): X_train / X_test are rebound here from lists of feature dicts
# to their hashed form, so this cell is not safe to run twice in a row.
# Re-run the split cell first if it has to be repeated.
X_train = hasher.transform(X_train)
X_test = hasher.transform(X_test)

In [21]:
# Shape of the hashed training matrix: (samples, 2**18 hashed columns).
X_train.shape


(169381, 262144)

In [22]:
# Shape of the stacked TF-IDF + count training matrix.
X_train_combined.shape

(169381, 30752)

In [None]:
from sklearn.svm import SVC

# Earlier attempt, kept for reference: vectorise the feature dicts inside a pipeline.
# clf = Pipeline([
#     ('vectorizer', DictVectorizer(sparse=False)),
#     ('classifier', SVC(kernel='linear', random_state=42))
# ])

# Linear-kernel SVM trained on the hashed features with integer-encoded labels.
clf = SVC(kernel='linear', random_state=42)

clf.fit(X_train, y_train_encoded)

# Predictions are encoded label ids; decode with encoder.inverse_transform to get POS tags.
y_pred = clf.predict(X_test)

In [None]:
# y_test_encoded / y_pred hold integer label ids. Decode them for the report
# so its rows show POS tags instead of opaque numbers. zero_division=0 keeps
# the reported values (undefined precision/recall counts as 0) without
# emitting a warning for classes that are never predicted.
print("======Accuracy Score======")
print(accuracy_score(y_test_encoded, y_pred))
print("======Classification Report======")
print(classification_report(
    encoder.inverse_transform(y_test_encoded),
    encoder.inverse_transform(y_pred),
    zero_division=0))

In [None]:
# SVC trained on the stacked TF-IDF + count features

In [26]:
from sklearn.svm import SVC

# Earlier attempt, kept for reference: vectorise the feature dicts inside a pipeline.
# clf = Pipeline([
#     ('vectorizer', DictVectorizer(sparse=False)),
#     ('classifier', SVC(kernel='linear', random_state=42))
# ])

# Same linear-kernel SVM, this time on the stacked TF-IDF + count matrix and
# the raw POS strings as labels. Note that this rebinds `clf`.
clf = SVC(kernel='linear', random_state=42)

clf.fit(X_train_combined, y_train_stack)

y_pred = clf.predict(X_test_combined)

In [27]:
# Score the stacked-feature SVM against the held-out POS strings.
# zero_division=0 states explicitly that precision/recall is reported as 0 for
# tags that are never predicted (same numbers as before, without the warning).
print("======Accuracy Score======")
print(accuracy_score(y_test_stack, y_pred))
print("======Classification Report======")
print(classification_report(y_test_stack, y_pred, zero_division=0))

0.9444103339158362


  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

           #       0.67      0.40      0.50         5
           $       1.00      0.99      1.00       359
          ''       1.00      0.99      1.00       289
           (       1.00      1.00      1.00        68
           )       1.00      1.00      1.00        68
           ,       1.00      1.00      1.00      2142
           .       1.00      1.00      1.00      1758
           :       0.98      1.00      0.99       220
          CC       1.00      1.00      1.00      1051
          CD       0.97      0.99      0.98      1693
          DT       1.00      0.99      0.99      3611
          EX       1.00      0.98      0.99        48
          FW       1.00      0.60      0.75         5
          IN       0.98      0.99      0.98      4545
          JJ       0.83      0.88      0.86      2643
         JJR       0.91      0.88      0.89       158
         JJS       0.97      1.00      0.98        85
          MD       1.00    

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
#--------------------------------------------------------------------------------------------------------

In [None]:
from sklearn.linear_model import LogisticRegression

# Logistic regression on the hashed features; iteration cap raised to 1000.
clf_log = LogisticRegression(max_iter=1000)

# Training the model (labels are the integer-encoded POS tags)
clf_log.fit(X_train, y_train_encoded)

# Predicting the POS Tag (as encoded ids; decode with encoder.inverse_transform)
y_pred = clf_log.predict(X_test)

In [None]:
# y_test_encoded / y_pred hold integer label ids. Decode them for the report
# so its rows show POS tags instead of opaque numbers. zero_division=0 keeps
# the reported values (undefined precision/recall counts as 0) without
# emitting a warning for classes that are never predicted.
print("======Accuracy Score======")
print(accuracy_score(y_test_encoded, y_pred))
print("======Classification Report======")
print(classification_report(
    encoder.inverse_transform(y_test_encoded),
    encoder.inverse_transform(y_pred),
    zero_division=0))

In [None]:
# Logistic regression trained on the stacked TF-IDF + count features

In [28]:
from sklearn.linear_model import LogisticRegression

# Logistic regression on the stacked TF-IDF + count matrix, raw POS strings as labels.
# NOTE(review): the recorded run of this cell hit the iteration limit
# ("TOTAL NO. of ITERATIONS REACHED LIMIT"), so the reported score comes from
# a model that had not converged. Consider a higher max_iter, scaled inputs or
# a different solver.
clf_log = LogisticRegression(max_iter=1000)

# Training the model
clf_log.fit(X_train_combined, y_train_stack)

# Predicting the POS Tag
y_pred = clf_log.predict(X_test_combined)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [29]:
# Score the stacked-feature logistic regression against the held-out POS strings.
# zero_division=0 states explicitly that precision/recall is reported as 0 for
# tags that are never predicted (same numbers as before, without the warning).
print("======Accuracy Score======")
print(accuracy_score(y_test_stack, y_pred))
print("======Classification Report======")
print(classification_report(y_test_stack, y_pred, zero_division=0))

0.936499315165541


  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

           #       0.67      0.40      0.50         5
           $       1.00      0.99      1.00       359
          ''       1.00      0.98      0.99       289
           (       1.00      1.00      1.00        68
           )       1.00      1.00      1.00        68
           ,       1.00      1.00      1.00      2142
           .       1.00      1.00      1.00      1758
           :       0.99      0.99      0.99       220
          CC       1.00      1.00      1.00      1051
          CD       0.97      0.98      0.98      1693
          DT       0.99      0.99      0.99      3611
          EX       0.96      1.00      0.98        48
          FW       1.00      0.40      0.57         5
          IN       0.97      0.99      0.98      4545
          JJ       0.84      0.83      0.83      2643
         JJR       0.89      0.88      0.89       158
         JJS       0.97      0.89      0.93        85
          MD       1.00    

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
#----------------------------------------------------------------------------------------------------------------

In [23]:
from sklearn.naive_bayes import MultinomialNB
# Multinomial Naive Bayes baseline on the stacked TF-IDF + count matrix,
# trained on the raw POS strings with default hyper-parameters.
clf_bayes = MultinomialNB()
clf_bayes.fit(X_train_combined, y_train_stack)
y_pred = clf_bayes.predict(X_test_combined)

In [24]:
# Score the Naive Bayes baseline against the held-out POS strings.
# zero_division=0 states explicitly that precision/recall is reported as 0 for
# tags that are never predicted (same numbers as before, without the warning).
print("======Accuracy Score======")
print(accuracy_score(y_test_stack, y_pred))
print("======Classification Report======")
print(classification_report(y_test_stack, y_pred, zero_division=0))

0.8486751995465923


  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

           #       0.00      0.00      0.00         5
           $       1.00      0.90      0.95       359
          ''       1.00      0.48      0.65       289
           (       0.00      0.00      0.00        68
           )       0.00      0.00      0.00        68
           ,       0.94      1.00      0.97      2142
           .       0.93      1.00      0.96      1758
           :       1.00      0.00      0.01       220
          CC       1.00      0.99      1.00      1051
          CD       0.93      0.89      0.91      1693
          DT       0.93      0.98      0.95      3611
          EX       0.00      0.00      0.00        48
          FW       0.00      0.00      0.00         5
          IN       0.91      0.95      0.93      4545
          JJ       0.67      0.67      0.67      2643
         JJR       0.75      0.35      0.48       158
         JJS       0.00      0.00      0.00        85
          MD       1.00    

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [32]:
# Read the unlabeled tokens as strings. comments=None is essential: by default
# np.loadtxt treats '#' as the start of a comment, which would silently drop
# or truncate every token containing '#' and leave the predictions with fewer
# rows than the input.
x = np.loadtxt('/content/drive/MyDrive/CMPSC448/MidtermProject/unlabeled_test_test.txt', dtype=str, comments=None)
# The first entry of the file is used as the column name; the rest are the tokens.
df_test = pd.DataFrame(data = x[1:], columns=[x[0]])
df_test.head()

Unnamed: 0,TAG
0,@paulwalk
1,It
2,'s
3,the
4,view


In [35]:
# Collect the index labels of rows whose token is whitespace only, then drop them.
blanks_ids = [
  i for i, token in zip(df_test.index, df_test["TAG"])
  if isinstance(token, str) and token.isspace()
]
df_test.drop(blanks_ids, inplace=True)
# Renumber the rows 0..n-1 so that positional lookups in later cells stay
# valid after rows were removed.
df_test.reset_index(drop=True, inplace=True)
print(len(blanks_ids))

# Replace accented characters with their closest ASCII form for EVERY token.
# The previous version assigned the result outside the loop (so only the last
# row was ever cleaned) and used chained assignment (df_test["TAG"][i] = ...),
# which pandas does not guarantee to write back to the frame.
# NOTE(review): these cleaned tokens are also what is written to
# predictions.txt later on; confirm that is acceptable for the submission.
cleaned_tokens = []
for token in df_test["TAG"]:
  text = unicodedata.normalize('NFKD', token).encode('ascii', 'ignore').decode('utf-8', 'ignore')
  # If folding would erase the token completely (e.g. it has no ASCII
  # equivalent at all), keep the original rather than an empty string.
  cleaned_tokens.append(text.strip() or token.strip())
df_test["TAG"] = cleaned_tokens


0


In [36]:
# NOTE(review): the frequency table is rebuilt from the unlabeled tokens, so
# the "freq" feature is computed on a different corpus than during training.
# Confirm this is intended.
freq = {}
calculate_freq_normal(freq, df_test)

# Use a plain list so tokens are addressed by position. Indexing the Series
# with an int is a label lookup and fails once rows have been dropped.
test_tokens = df_test['TAG'].tolist()
X_Test = [define_new_features(test_tokens, x, freq) for x in range(len(test_tokens))]

# Reuse the hasher from training so the columns line up with X_train.
X_Test = hasher.transform(X_Test)

In [37]:
# Refit the linear SVM on the hashed training features (`clf` was rebound by
# the stacked-feature experiment above), tag the unlabeled tokens and turn the
# predicted label ids back into POS strings.
clf = SVC(kernel='linear', random_state=42).fit(X_train, y_train_encoded)

y_Pred = encoder.inverse_transform(clf.predict(X_Test))
y_Pred[:5]

array(['JJ', 'PRP', 'VBZ', 'DT', 'NN'], dtype='<U4')

In [38]:
# Pair each token with its predicted tag by position. zip avoids the
# label-based df_test['TAG'][i] lookup, which raises KeyError as soon as the
# frame's index is no longer exactly 0..n-1.
final = [token + " " + tag for token, tag in zip(df_test['TAG'], y_Pred)]

# One "token tag" pair per line. The with-block closes the file on exit, so no
# explicit close() is needed.
with open("/content/drive/MyDrive/CMPSC448/MidtermProject/predictions.txt", 'w') as file:
  for pred in final:
    file.write(pred + "\n")